# Implementation of DDPG to solve the "Lunar Lander" problem (continuous version)

We begin by importing the relevant modules and defining some useful functions.

In [None]:
import pandas as pd
from collections import deque, namedtuple
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# Load packages
import numpy as np
import gym
import torch
import matplotlib.pyplot as plt
from tqdm import trange
from DDPG_soft_updates import soft_updates


def running_average(x, N):
    ''' Function used to compute the running average
        of the last N elements of a vector x

        Args:
            x: sequence (list or array) of numbers
            N (int): length of the averaging window

        Returns:
            y (np.ndarray): float array with the same length as x. When x has
                at least N elements, y[i] for i >= N-1 is the mean of the N
                values ending at position i and the first N-1 entries are
                copies of x; otherwise y is all zeros.
    '''
    # Work in floating point: copying an integer input (e.g. a list of step
    # counts) would give an integer array and silently truncate the averages.
    values = np.asarray(x, dtype=float)
    if len(values) < N:
        return np.zeros_like(values)

    y = values.copy()
    y[N-1:] = np.convolve(values, np.ones((N, )) / N, mode='valid')
    return y


def soft_updates(network: nn.Module,
                 target_network: nn.Module,
                 tau: float) -> nn.Module:
    """ Blend the parameters of a network into its target network.

        Every state-dict entry of the source network replaces the matching
        entry of the target by (1 - tau) * target + tau * source; the result
        is loaded back into the target network.

        Args:
            network (nn.Module): neural network whose parameters are copied
            target_network (nn.Module): network that is being updated
            tau (float): time constant that defines the update speed in (0,1)

        Returns:
            target_network (nn.Module): the same target network object, after
                the update

    """
    blended = target_network.state_dict()
    source = network.state_dict()
    blended.update({
        name: (1 - tau) * blended[name] + tau * weights
        for name, weights in source.items()
    })
    target_network.load_state_dict(blended)
    return target_network


Here we define the random agent, the experience replay buffer, and the actor and critic neural networks.

In [None]:
import torch.nn as nn
# One transition stored in the replay buffer:
# (state, action, reward, next_state, done)
Experience = namedtuple(
    'Experience',
    'state action reward next_state done',
)



class Agent(object):
    ''' Minimal base class for agents.

        Defined here because RandomAgent derives from it and no other
        definition or import of Agent is visible in this file.

        Args:
            n_actions (int): dimensionality of an action

        Attributes:
            n_actions (int): dimensionality of an action
    '''
    def __init__(self, n_actions: int):
        self.n_actions = n_actions

    def forward(self, state: np.ndarray):
        ''' Compute an action for the given state (to be overridden) '''
        raise NotImplementedError


class RandomAgent(Agent):
    ''' Agent taking actions uniformly at random, child of the class Agent'''
    def __init__(self, n_actions: int):
        super(RandomAgent, self).__init__(n_actions)

    def forward(self, state: np.ndarray) -> np.ndarray:
        ''' Compute a random action in [-1, 1]

            Args:
                state (np.ndarray): current state; not used by this agent

            Returns:
                action (np.ndarray): array of float values containing the
                    action. The dimensionality is equal to self.n_actions from
                    the parent class Agent.
        '''
        # np.random.rand draws from [0, 1), so the result already lies in
        # [-1, 1) and needs no clipping.
        return -1 + 2 * np.random.rand(self.n_actions)
    
class ExperienceReplayBuffer(object):
    """ Bounded store for the experiences collected by the RL agent.

        The storage is a deque created with `maxlen`, so once the maximum
        length is reached every new experience pushes out the oldest one.
    """
    def __init__(self, maximum_length):
        # Bounded deque holding the experiences
        self.buffer = deque(maxlen=maximum_length)

    def append(self, experience):
        # Store one more experience at the end of the buffer
        self.buffer.append(experience)

    def __len__(self):
        # len(buffer) reports how many experiences are currently stored
        return len(self.buffer)

    def sample_batch(self, n):
        """ Draw n distinct experiences uniformly at random.

            Args:
                n (int): number of experiences to sample

            Returns:
                A zip iterator that groups the sampled experiences field by
                field: for Experience tuples it yields, in order, the n
                states, actions, rewards, next states and done flags, each as
                a tuple of length n.

            Raises:
                IndexError: if n exceeds the number of stored experiences
        """
        available = len(self.buffer)
        if n > available:
            raise IndexError('Tried to sample too many elements from the buffer!')

        # Indices are drawn without replacement, so no experience is repeated
        picked = np.random.choice(available, size=n, replace=False)

        # zip(*rows) transposes the sampled rows: n tuples of k fields become
        # k tuples of n values
        return zip(*(self.buffer[idx] for idx in picked))

### Neural Network ###
class actor_net(nn.Module):
    """ Feedforward actor network.

        Two hidden fully connected layers (400 and 200 units) with ReLU
        activations, followed by a linear output layer squashed by tanh, so
        every output component lies in [-1, 1].

        Args:
            input_size (int): number of input features
            output_size (int): number of output features
    """
    def __init__(self, input_size, output_size):
        super().__init__()

        # First hidden layer; the ReLU module is reused for both hidden layers
        self.input_layer = nn.Linear(input_size, 400)
        self.input_layer_activation = nn.ReLU()

        # Second hidden layer
        self.input_layer1 = nn.Linear(400, 200)

        # Output layer and its tanh squashing
        self.output_layer = nn.Linear(200, output_size)
        self.output_layer_activation = nn.Tanh()

    def forward(self, x):
        # Forward pass: x -> hidden (ReLU) -> hidden (ReLU) -> tanh output
        hidden = self.input_layer_activation(self.input_layer(x))
        hidden = self.input_layer_activation(self.input_layer1(hidden))
        return self.output_layer_activation(self.output_layer(hidden))
    
class critic_net(nn.Module):
    """ Feedforward critic network.

        The state goes through a first fully connected layer (400 units,
        ReLU); its output is concatenated with the action and fed to a second
        hidden layer (200 units, ReLU) followed by a linear output layer.

        Args:
            input_size (int): number of state features
            output_size (int): number of output features
            m (int): number of action features concatenated to the first
                hidden layer's output
    """
    def __init__(self, input_size, output_size, m):
        super().__init__()

        # State-only input layer; the ReLU module is reused for both hidden
        # layers
        self.input_layer = nn.Linear(input_size, 400)
        self.input_layer_activation = nn.ReLU()

        # Second hidden layer receives the 400 state features plus the action
        self.input_layer1 = nn.Linear(400+m, 200)
        self.output_layer = nn.Linear(200, output_size)

    def forward(self, x, a):
        """ Compute the critic output for a batch of (state, action) pairs.

            Args:
                x (torch.Tensor): batch of states, one row per sample
                a (torch.Tensor): batch of actions, one row per sample

            Returns:
                out (torch.Tensor): output of the last linear layer
        """
        # Compute first layer on the state alone
        l1 = self.input_layer(x)
        l1 = self.input_layer_activation(l1)

        # Inject the action along the feature dimension
        hidden = torch.cat([l1, a], dim=1)
        l1 = self.input_layer1(hidden)
        l1 = self.input_layer_activation(l1)

        out = self.output_layer(l1)

        # The computed value must be returned: a bare `return` hands None to
        # the caller.
        return out
    

Here we set the training hyperparameters, fill the replay buffer with random experiences, and run the training loop.

In [None]:

# Create and initialize the continuous Lunar Lander environment
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Parameters
N_episodes = 300              # Number of episodes to run for training
discount_factor = 0.99         # Value of gamma
n_ep_running_average = 50      # Running average of 50 episodes
n_actions = len(env.action_space.high)               # Action dimensionality
dim_state = len(env.observation_space.high)  # State dimensionality
mu=0.15                        # Factor applied to the previous exploration noise
sigma=0.2                      # Std of the Gaussian term of the exploration noise
lr_actor=5e-5                  # Learning rate of the actor optimizer
lr_critic=5e-4                 # Learning rate of the critic optimizer
L=50000                        # Maximum length of the replay buffer
tau=1e-3                       # Soft update coefficient of the target networks
N=64                           # Batch size (training starts once the buffer holds N items)
d=2                            # Actor and target networks are updated every d steps
max_norm=1                     # Maximum gradient norm used for clipping

# Reward
episode_reward_list = []  # Used to save episodes reward
episode_number_of_steps = []  # Used to save the number of steps of each episode

### Create Experience replay buffer ###
buffer = ExperienceReplayBuffer(maximum_length=L)

### Filling up Buffer with Random experiences
agent = RandomAgent(n_actions)

# Play random episodes until the buffer holds L experiences. Looping over L
# whole episodes would collect far more transitions than the buffer can keep
# (it only retains the last L), at a much higher cost.
while len(buffer) < L:
    # Reset environment data and initialize variables
    done = False
    state = env.reset()

    while not done and len(buffer) < L:
        # Take a random action
        action = agent.forward(state)

        # Get next state and reward. The done variable is True when the
        # episode has ended, False otherwise
        next_state, reward, done, _ = env.step(action)

        buffer.append(Experience(state, action, reward, next_state, done))

        # Update state for next iteration
        state = next_state

# The environment is deliberately not closed here: it is reused afterwards
# for training.


### Create network ###

actor = actor_net(input_size=dim_state, output_size=n_actions)
critic = critic_net(input_size=dim_state, output_size=1, m=n_actions)

# Target networks start as exact copies of the online networks. With
# independent random initialisation and a small tau, the soft updates would
# need many steps before the targets resemble the networks they track.
target_actor = actor_net(input_size=dim_state, output_size=n_actions)
target_actor.load_state_dict(actor.state_dict())
target_critic = critic_net(input_size=dim_state, output_size=1, m=n_actions)
target_critic.load_state_dict(critic.state_dict())


### Create optimizer ###
# Only the online networks are optimized; the targets change exclusively
# through soft updates.
optimizer_actor = optim.Adam(actor.parameters(), lr=lr_actor)
optimizer_critic = optim.Adam(critic.parameters(), lr=lr_critic)

### TRAINING LOOP ###
# Every episode is played with the actor plus exploration noise. After each
# environment step the critic is trained on a batch sampled from the replay
# buffer; every d steps the actor is trained and both target networks are
# soft-updated.

# NOTE(review): max_score, avg and steps are not read anywhere in the visible
# code; they are kept in case another cell relies on them.
max_score= -1000
avg= -1000
#Steps to update target network
steps=0

# Training process
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)

for episode in EPISODES:

    total_episode_reward = 0
    state = env.reset()              # Reset environment, returns initial state
    noise = np.zeros(n_actions)      # Exploration noise restarts at 0
    done = False                     # True once the episode has ended
    t = 0                            # Step counter within the episode

    while not done:
        # Exploration noise (labelled Ornstein-Uhlenbeck in the original
        # comments): n_t = -mu * n_{t-1} + w_t with w_t ~ N(0, sigma^2)
        noise = -mu * noise + np.random.normal(0, sigma, n_actions)

        # Take the perturbed action using the actor network
        actor.eval()
        with torch.no_grad():
            state_tensor = torch.as_tensor(state, dtype=torch.float32)
            noise_tensor = torch.as_tensor(noise, dtype=torch.float32)
            # torch.clamp keeps the computation on tensors; the action is
            # then converted to numpy once, for both the environment and the
            # replay buffer.
            action = torch.clamp(
                actor(state_tensor) + noise_tensor, -1, 1).numpy()

        # env.step(action) returns 4 variables:
        # (1) next state; (2) reward; (3) done variable; (4) additional stuff
        next_state, reward, done, _ = env.step(action)

        # Update episode reward
        total_episode_reward += reward

        # Append experience to the buffer; the action is stored as a numpy
        # array, like the ones produced by the random agent
        buffer.append(Experience(state, action, reward, next_state, done))

        # Update state for next iteration
        state = next_state

        ### TRAINING ###
        # Perform training only if we have at least N elements in the buffer
        if len(buffer) >= N:

            # Sample a batch of N elements
            states, actions, rewards, next_states, dones = \
                buffer.sample_batch(n=N)

            # Stack each field into one array before building the tensor
            # (building a tensor from a tuple of arrays is slow)
            s = torch.as_tensor(np.array(states), dtype=torch.float32)
            next_s = torch.as_tensor(np.array(next_states),
                                     dtype=torch.float32)
            action_batch = torch.as_tensor(
                np.stack([np.asarray(act, dtype=np.float32)
                          for act in actions]),
                dtype=torch.float32)

            target_critic.eval()
            target_actor.eval()
            actor.train()
            critic.train()

            # Critic targets: y = r + gamma * (1 - done) * Q'(s', pi'(s'))
            with torch.no_grad():
                next_values = target_critic(next_s, target_actor(next_s))
                reward_batch = torch.as_tensor(
                    rewards, dtype=torch.float32).reshape(next_values.shape)
                done_batch = torch.as_tensor(
                    dones, dtype=torch.float32).reshape(next_values.shape)
                target_values = (reward_batch
                                 + discount_factor * (1 - done_batch)
                                 * next_values)

            # Critic update: regress Q(s, a) on the targets
            values = critic(s, action_batch)
            loss_critic = nn.functional.mse_loss(values, target_values)

            optimizer_critic.zero_grad()
            loss_critic.backward()
            # Clip gradient norm to max_norm
            nn.utils.clip_grad_norm_(critic.parameters(), max_norm=max_norm)
            optimizer_critic.step()

            # Delayed update of the actor and of the target networks
            if t % d == 0:
                # Actor update: maximize Q(s, pi(s)), i.e. minimize its
                # negative mean. Gradients that reach the critic here are
                # discarded by the next optimizer_critic.zero_grad().
                loss_actor = -torch.mean(critic(s, actor(s)))

                optimizer_actor.zero_grad()
                loss_actor.backward()
                nn.utils.clip_grad_norm_(actor.parameters(),
                                         max_norm=max_norm)
                optimizer_actor.step()

                target_critic = soft_updates(critic, target_critic, tau)
                target_actor = soft_updates(actor, target_actor, tau)

        t += 1

    # Add rewards and number of steps
    episode_reward_list.append(total_episode_reward)
    episode_number_of_steps.append(t)
    EPISODES.set_description(
        "Episode {} - Reward/Steps: {:.1f}/{} - Avg. Reward/Steps: {:.1f}/{}".format(
        episode, total_episode_reward, t,
        running_average(episode_reward_list, n_ep_running_average)[-1],
        running_average(episode_number_of_steps, n_ep_running_average)[-1]))
    



# Close all the windows
env.close()


# Plot rewards (left) and number of steps (right) per episode, each together
# with its running average over n_ep_running_average episodes
episode_axis = list(range(1, N_episodes+1))
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))

panels = (
    (ax[0], episode_reward_list,
     'Episode reward', 'Avg. episode reward',
     'Total reward', 'Total Reward vs Episodes'),
    (ax[1], episode_number_of_steps,
     'Steps per episode', 'Avg. number of steps per episode',
     'Total number of steps', 'Total number of steps vs Episodes'),
)

for axis, series, raw_label, avg_label, y_label, title in panels:
    axis.plot(episode_axis, series, label=raw_label)
    axis.plot(episode_axis,
              running_average(series, n_ep_running_average),
              label=avg_label)
    axis.set_xlabel('Episodes')
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend()
    axis.grid(alpha=0.3)

plt.show()

# Check Solution:

Here we check our solution to the problem by evaluating the trained actor over 50 episodes.

In [None]:

# Load packages
import numpy as np
import gym
import torch
from tqdm import trange

def running_average(x, N):
    ''' Function used to compute the running average
        of the last N elements of a vector x

        Args:
            x: sequence (list or array) of numbers
            N (int): length of the averaging window

        Returns:
            y (np.ndarray): float array with the same length as x. When x has
                at least N elements, y[i] for i >= N-1 is the mean of the N
                values ending at position i and the first N-1 entries are
                copies of x; otherwise y is all zeros.
    '''
    # Convert to float first so that integer inputs do not truncate the
    # averages written into the copy.
    data = np.asarray(x, dtype=float)
    if len(data) >= N:
        y = data.copy()
        y[N-1:] = np.convolve(data, np.ones((N, )) / N, mode='valid')
    else:
        y = np.zeros_like(data)
    return y

# Load model
# The policy under test is the actor trained above; nothing is read from
# disk. The only failure handled is the actor not existing yet (training
# cell not run), which is reported explicitly instead of swallowing every
# exception.
try:
    model = actor
except NameError:
    print('Trained actor network not found: run the training cell first!')
    exit(-1)
else:
    print('Network model: {}'.format(model))

# Create and initialize the continuous Lunar Lander environment
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Parameters
N_EPISODES = 50            # Number of episodes used to evaluate the policy
CONFIDENCE_PASS = 125      # Reward threshold the policy has to exceed

# Reward
episode_reward_list = []  # Used to store episodes reward

# Simulate episodes
print('Checking solution...')
EPISODES = trange(N_EPISODES, desc='Episode: ', leave=True)
for i in EPISODES:
    EPISODES.set_description("Episode {}".format(i))
    # Reset environment data
    done = False
    state = env.reset()
    total_episode_reward = 0.
    while not done:
        # Query the policy without tracking gradients; the state is turned
        # into a float32 batch of size 1 and the single action is extracted
        with torch.no_grad():
            state_batch = torch.as_tensor(
                np.asarray(state), dtype=torch.float32).unsqueeze(0)
            action = model(state_batch)[0]

        # Get next state and reward. The done variable is True when the
        # episode has ended, False otherwise
        next_state, reward, done, _ = env.step(action.numpy())

        # Update episode reward
        total_episode_reward += reward

        # Update state for next iteration
        state = next_state

    # Append episode reward
    episode_reward_list.append(total_episode_reward)

# Close the environment once, after the last episode, rather than while it
# is still being used by the loop
env.close()

# Mean episode reward and half-width of its 95% confidence interval
# (1.96 standard errors)
reward_samples = np.asarray(episode_reward_list)
avg_reward = reward_samples.mean()
confidence = 1.96 * reward_samples.std() / np.sqrt(N_EPISODES)


summary = 'Policy achieves an average total reward of {:.1f} +/- {:.1f} with confidence 95%.'
print(summary.format(avg_reward, confidence))

# The policy passes when the lower end of the interval reaches the threshold
passed = avg_reward - confidence >= CONFIDENCE_PASS
if not passed:
    print("Your policy did not pass the test! The average reward of your policy needs to be greater than {} with 95% confidence".format(CONFIDENCE_PASS))
else:
    print('Your policy passed the test!')
