# Install the pybullet environment

In [2]:
# Install the pybullet package with pip through the notebook's `!` shell escape.
!pip install pybullet

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/d8/ac/a422ab8d1c57ab3f43e573b5a5f532e6afd348d81308fe66a1ecb691548e/pybullet-2.7.1-cp36-cp36m-manylinux1_x86_64.whl (95.0MB)
[K     |████████████████████████████████| 95.0MB 55kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-2.7.1


# Import the libraries 

1. PyBullet provides environments that plug into Gym and let us run 3D animated simulations.

2. Gym's `wrappers` module is used to wrap the PyBullet environment.



In [0]:
import os
import math 
import numpy as np
import matplotlib.pyplot as plt
import torch
import gym
import pybullet_envs
import torch.nn as nn
from gym import wrappers
import torch.nn.functional as F
from collections import deque
from torch.autograd import Variable

## Step 1


1. We initialize the Experience Replay Memory with a size of 1e6.

2. Then we populate it with new transitions.


In [0]:
# Defines the ReplayBuffer Class so that we can create objects of this class 
class ReplayBuffer(object):
  """Fixed-capacity experience replay memory.

  Each stored transition is a 5-tuple in the order
  ``(state, next_state, action, reward, done)``. Once the buffer holds
  ``max_size`` transitions, new ones overwrite the stored ones sequentially,
  starting from the first slot.
  """

  def __init__(self,max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions to keep. Floats such as the
        default ``1e6`` are accepted and truncated to an integer.

    Raises:
      ValueError: If ``max_size`` is smaller than 1.
    """
    max_size=int(max_size)
    if max_size<1:
      raise ValueError("max_size must be at least 1")
    self.storage=[]
    # Kept as an int so that the write pointer stays a valid list index.
    self.max_size=max_size
    self.ptr=0

  def add(self,transition):
    """Store one transition, overwriting a stored one when the buffer is full.

    Args:
      transition: Tuple ``(state, next_state, action, reward, done)``.
    """
    if len(self.storage)>=self.max_size:
      # Buffer is full: overwrite the slot under the pointer, then advance it
      # and wrap back to the first slot after the last one.
      self.storage[self.ptr]=transition
      self.ptr=(self.ptr+1)%self.max_size
    else:
      self.storage.append(transition)

  def sample(self,batch_size):
    """Return a random batch of transitions, drawn with replacement.

    Args:
      batch_size: Number of transitions to draw.

    Returns:
      Tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
      arrays. ``rewards`` and ``dones`` are reshaped to ``(batch_size, 1)``.

    Raises:
      ValueError: If the buffer is empty.
    """
    if not self.storage:
      raise ValueError("Cannot sample from an empty replay buffer")
    ind=np.random.randint(0,len(self.storage),batch_size)
    batch_state,batch_next_state,batch_action,batch_reward,batch_dones=[],[],[],[],[]
    for i in ind:
      state,next_state,action,reward,done=self.storage[i]
      # np.asarray avoids a copy when the value is already an array and does
      # not raise when a conversion (hence a copy) is unavoidable.
      batch_state.append(np.asarray(state))
      batch_next_state.append(np.asarray(next_state))
      batch_action.append(np.asarray(action))
      batch_reward.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (
      np.array(batch_state),
      np.array(batch_next_state),
      np.array(batch_action),
      np.array(batch_reward).reshape(-1,1),
      np.array(batch_dones).reshape(-1,1),
    )



## Step 2

Build a model for the Actor Model and the Actor Target. Both use the same architecture, so they share a single class definition.

In [0]:
#**************************************************************************************************************************
# The Actor constructor takes the following inputs:
# 1. state_dims  - State parameters
# 2. action_dims - How many actions can be taken
# 3. max_action  - the limit that each action can take (for example 5 degree, 10 degree etc.)
#**************************************************************************************************************************

class Actor(nn.Module):
  """Policy network: maps a batch of states to a batch of bounded actions.

  Three fully connected layers (state_dims -> 400 -> 300 -> action_dims) with
  ReLU activations on the two hidden layers.
  """

  def __init__(self,state_dims,action_dims,max_action):
    """Build the layers and remember the action bound.

    Args:
      state_dims: Number of input features of the first layer.
      action_dims: Number of output features of the last layer.
      max_action: Factor the tanh output is multiplied by.
    """
    super().__init__()
    self.layer1=nn.Linear(state_dims,400)
    self.layer2=nn.Linear(400,300)
    self.layer3=nn.Linear(300,action_dims)
    self.max_action=max_action

  def forward(self,x):
    """Return ``max_action * tanh(layer3(...))`` for the input batch ``x``.

    tanh squashes the last layer's output into [-1, 1]; multiplying by
    ``max_action`` rescales it to [-max_action, max_action].
    """
    hidden=F.relu(self.layer1(x))
    hidden=F.relu(self.layer2(hidden))
    squashed=torch.tanh(self.layer3(hidden))
    return self.max_action*squashed

## Step 3

We are creating 2 Critic models here 

In [0]:
class Critic(nn.Module):
  """Two independent Q-networks (twin critics) held in one module.

  Each network maps the concatenation of a state batch and an action batch
  through state_dims+action_dims -> 400 -> 300 -> 1, i.e. it produces one
  scalar Q-value per (state, action) pair. ``max_action`` is not needed here.
  """

  def __init__(self,state_dims,action_dims):
    """Build both critic networks.

    Args:
      state_dims: Number of features in a state vector.
      action_dims: Number of features in an action vector.
    """
    super(Critic,self).__init__()
    # First Critic Network
    self.layer1=nn.Linear(state_dims+action_dims,400)
    self.layer2=nn.Linear(400,300)
    self.layer3=nn.Linear(300,1)   # a Q-value is a single scalar per sample
    # Second Critic Network
    self.layer4=nn.Linear(state_dims+action_dims,400)
    self.layer5=nn.Linear(400,300)
    self.layer6=nn.Linear(300,1)   # a Q-value is a single scalar per sample

  def forward(self,x,u):     # x - state , u - action
    """Return the Q-values of both critics for the batch ``(x, u)``.

    Args:
      x: State batch, one row per sample.
      u: Action batch, one row per sample.

    Returns:
      Tuple ``(q1, q2)``; each tensor has one column and one row per sample.
    """
    # dim=1 joins the feature columns, so every row becomes [state | action].
    xu=torch.cat([x,u],1)

    # Forward propagation for first critic
    x1=F.relu(self.layer1(xu))
    x1=F.relu(self.layer2(x1))
    x1=self.layer3(x1)

    # Forward propagation for second critic (fed the same untouched input xu)
    x2=F.relu(self.layer4(xu))
    x2=F.relu(self.layer5(x2))
    x2=self.layer6(x2)
    return x1,x2

  def Q1(self,x,u):        # x - state , u - action
    """Return only the first critic's Q-values for the batch ``(x, u)``.

    The first critic is the one used to train the actor; using the second
    critic or an average of both would be an equally valid design choice.
    """
    # dim=1 joins the feature columns, so every row becomes [state | action].
    xu=torch.cat([x,u],1)

    x1=F.relu(self.layer1(xu))
    x1=F.relu(self.layer2(x1))
    x1=self.layer3(x1)
    return x1

# Training Process
Create the T3D class, initialize its variables, and get ready for Step 4.

In [0]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Building the whole training process into a class
class T3D(object):
  """Training agent made of an actor, a twin critic and a target copy of each.

  The class only depends on the dimensions handed to the constructor, so it is
  not tied to one particular environment. All networks are placed on the
  module-level ``device``.
  """

  def __init__(self,state_dims,action_dims,max_action):
    """Create the four networks and the two Adam optimizers.

    Args:
      state_dims: Size of a state vector (e.g. position, velocity,
        acceleration, distance travelled, ...).
      action_dims: Size of an action vector.
      max_action: Bound used by the actor and when clamping noisy actions.
    """
    # Actor model and its target, both sent to the selected device.
    self.actor=Actor(state_dims,action_dims,max_action).to(device)
    self.actor_target=Actor(state_dims,action_dims,max_action).to(device)   # updated by Polyak averaging
    # The target starts with exactly the same weights as the model.
    # state_dict must be *called*; load_state_dict expects the mapping itself.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer=torch.optim.Adam(self.actor.parameters())

    # Critic model (trained by gradient descent) and its target.
    self.critic=Critic(state_dims,action_dims).to(device)
    self.critic_target=Critic(state_dims,action_dims).to(device)            # updated by Polyak averaging
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer=torch.optim.Adam(self.critic.parameters())

    self.max_action=max_action

  def select_action(self,state):
    """Return the actor's action for a single state as a flat NumPy array.

    The state is reshaped to one row, moved to ``device`` and passed through
    the actor; the result is moved back to the CPU, converted to NumPy and
    flattened.
    """
    state=torch.Tensor(state.reshape(1,-1)).to(device)
    # Inference only: no autograd graph is needed for action selection.
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  @staticmethod
  def _soft_update(source,target,tau):
    """Polyak-average ``source`` parameters into ``target`` in place."""
    for param,target_param in zip(source.parameters(),target.parameters()):
      target_param.data.copy_(tau*param.data+(1-tau)*target_param.data)

  def train(self,replay_buffer,iterations,batch_size=100,discount=0.99,tau=0.005,policy_noise=0.2,noise_clip=0.5,policy_freq=2):
    """Run ``iterations`` training steps on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)``.
      iterations: Number of training steps to run.
      batch_size: Number of transitions sampled per step.
      discount: Discount factor of the Bellman equation.
      tau: Polyak averaging coefficient for the target networks.
      policy_noise: Standard deviation of the Gaussian noise added to the
        target action.
      noise_clip: Bound applied to that noise (clamped to
        [-noise_clip, noise_clip]).
      policy_freq: The actor and both targets are updated once every
        ``policy_freq`` steps; the critic is updated on every step.
    """
    for it in range(iterations):
      # STEP 4: sample a batch of transitions and send each part to the device.
      batch_state,batch_next_state,batch_action,batch_reward,batch_dones=replay_buffer.sample(batch_size)
      state=torch.Tensor(batch_state).to(device)
      next_state=torch.Tensor(batch_next_state).to(device)
      action=torch.Tensor(batch_action).to(device)
      reward=torch.Tensor(batch_reward).to(device)
      done=torch.Tensor(batch_dones).to(device)

      # The whole target is a constant for the critic loss, so it is built
      # without an autograd graph (replaces the explicit detach()).
      with torch.no_grad():
        # STEP 5: from the next state s', the actor target plays the next action a'.
        next_action=self.actor_target(next_state)

        # STEP 6: add Gaussian noise (mean 0, std policy_noise) shaped like the
        # batch of actions, clamp the noise, then clamp the noisy action to
        # [-max_action, max_action].
        noise=(torch.randn_like(action)*policy_noise).clamp(-noise_clip,noise_clip)
        next_action=(next_action+noise).clamp(-self.max_action,self.max_action)

        # STEP 7: the two critic targets take (s', a') and return Qt1 and Qt2.
        target_Q1,target_Q2=self.critic_target(next_state,next_action)

        # STEP 8: keep the minimum of the two target Q-values.
        target_Q=torch.min(target_Q1,target_Q2)

        # STEP 9: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term
        # switched off when the episode is over (done: 0 = not over, 1 = over).
        target_Q=reward+(1-done)*discount*target_Q

      # STEP 10: the two critic models take (s, a) and return two Q-values.
      current_Q1,current_Q2=self.critic(state,action)

      # STEP 11: the critic loss is the sum of the losses of both critics.
      critic_loss=F.mse_loss(current_Q1,target_Q)+F.mse_loss(current_Q2,target_Q)

      # STEP 12: backpropagate the critic loss and update both critics with Adam.
      self.critic_optimizer.zero_grad()    # reset the gradients to zero
      critic_loss.backward()               # compute the gradients
      self.critic_optimizer.step()         # perform the weight update

      # STEP 13: once every policy_freq iterations, update the actor by gradient
      # ascent on the first critic's output (minimizing its negated mean).
      if it%policy_freq==0:
        actor_loss=-self.critic.Q1(state,self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # STEP 14: in the same delayed step, update the actor target by
        # Polyak averaging.
        self._soft_update(self.actor,self.actor_target,tau)

        # STEP 15: in the same delayed step, update the critic target by
        # Polyak averaging.
        self._soft_update(self.critic,self.critic_target,tau)




    


