## Installing the packages

In [1]:
!pip install pybullet



## Importing the libraries

In [7]:
import os
import random
import pybullet_envs
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

## Step 1: Initialize the Experience Replay Memory

For the first episode we will populate the Experience Replay Memory with transitions generated by random actions.  
Then in each subsequent training episode we will leverage some instances of the Experience Replay Memory to account for anomalous (state, action) pairs.

In [3]:
class ReplayBuffer(object):
    """Fixed-capacity store of transitions with uniform random sampling.

    Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
    Once ``max_size`` transitions are stored, new transitions overwrite the
    existing ones in a circular fashion, starting from index 0.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. A float such as
                ``1e6`` is accepted and converted with ``int()``.
        """
        self.storage = []
        # Kept as an int so the write pointer is always a valid list index.
        self.max_size = int(max_size)
        self.ptr = 0

    def add(self, transition):
        """Store one transition, overwriting an old one when the buffer is full.

        Args:
            transition: Tuple ``(state, next_state, action, reward, done)``.
        """
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            # Wrap around so the next write lands on the following slot.
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
            arrays. ``rewards`` and ``dones`` are reshaped with
            ``reshape(-1, 1)`` so they come back as columns.

        Note:
            The buffer must contain at least one transition;
            ``np.random.randint`` rejects an empty index range.
        """
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch_states, batch_next_states, batch_actions = [], [], []
        batch_rewards, batch_dones = [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False), does not raise when one is required.
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )

## Step 2: Build Neural Network for Actor Model & Actor Target

An Actor Class is created for easy instantiation during Training.
To ensure we have a continuous action space, as opposed to a discrete action space, we scale the result of the *tanh() function* (which lies in [-1, 1]) by the maximum allowed action value, i.e. **self.max_action**

In [6]:
class Actor(nn.Module):
    """Policy network: two ReLU hidden layers (400 and 300 units) and a tanh head.

    The tanh output is multiplied by ``max_action``, so every returned
    component lies in ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Build the three linear layers and remember the action scale.

        Args:
            state_dim: Number of input features of the first layer.
            action_dim: Number of output features of the last layer.
            max_action: Factor applied to the tanh output.
        """
        super().__init__()  # Initialise nn.Module so the layers are registered
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Forward propagation: map ``x`` to a bounded action.

        Args:
            x: Input tensor whose last dimension is ``state_dim``
                (presumably a batch of states).

        Returns:
            Tensor whose last dimension is ``action_dim``.
        """
        hidden = x
        # Both hidden layers use the same ReLU activation.
        for hidden_layer in (self.layer_1, self.layer_2):
            hidden = F.relu(hidden_layer(hidden))
        squashed = torch.tanh(self.layer_3(hidden))
        return self.max_action * squashed

## Step 3: Build Neural Network for Critic Models & Critic Targets

We add a second forward method, `Q1`, which runs forward propagation through the first critic neural network only; its output is what we use for the gradient ascent step.

In [9]:
class Critic(nn.Module):
    """Two independent Q-networks that share the same (state, action) input.

    Each network has two ReLU hidden layers (400 and 300 units) and a single
    linear output unit. ``forward`` evaluates both networks; ``Q1`` evaluates
    only the first one.
    """

    def __init__(self, state_dim, action_dim):
        """Build both critic networks.

        Args:
            state_dim: Number of state features.
            action_dim: Number of action features.
        """
        super(Critic, self).__init__()
        # Defining the first Critic Neural Network
        self.layer_1 = nn.Linear(state_dim + action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)

        # Defining the second Critic Neural Network
        self.layer_4 = nn.Linear(state_dim + action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def _first_critic(self, xu):
        """Forward-propagate the joined input through layers 1-3."""
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        return self.layer_3(x1)

    def _second_critic(self, xu):
        """Forward-propagate the joined input through layers 4-6."""
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        return self.layer_6(x2)

    def forward(self, x, u):
        """Evaluate both critics on the same input.

        Args:
            x: State tensor; concatenated with ``u`` along dimension 1.
            u: Action tensor.

        Returns:
            Tuple ``(q1, q2)`` with the outputs of the first and second critic.
        """
        xu = torch.cat([x, u], 1)
        return self._first_critic(xu), self._second_critic(xu)

    def Q1(self, x, u):
        """Evaluate only the first critic.

        Returns the same value as the first element of ``forward(x, u)``,
        without computing the second critic.

        Args:
            x: State tensor; concatenated with ``u`` along dimension 1.
            u: Action tensor.

        Returns:
            Output of the first critic.
        """
        xu = torch.cat([x, u], 1)
        # Sharing the helper with forward() keeps the hidden activations
        # (not the raw state) flowing through layers 2 and 3.
        return self._first_critic(xu)


## Step 4 to 15: Training Process

We run a full episode with the first 10,000 actions played randomly to facilitate exploration, then play the actions chosen by the Actor model. Only after that do we start to sample from the Experience Replay Memory.

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


#Training Process Class
class TD3(object):
    def __init__(self, )