# DQN Implementation to solve "Lunar Lander" Discrete version

In [None]:
import pandas as pd
from collections import deque, namedtuple
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# Load packages
import numpy as np
import gym
import torch
import matplotlib.pyplot as plt
from tqdm import trange
import random

def running_average(x, N):
    ''' Function used to compute the running average
        of the last N elements of a vector x

        Args:
            x (sequence of numbers): values to average (list or 1-D array)
            N (int): size of the averaging window

        Returns:
            y (np.ndarray): float array with the same length as x.
                If len(x) >= N, y[k] for k >= N-1 is the mean of
                x[k-N+1 : k+1] and the first N-1 entries are the raw values
                of x. If len(x) < N, y is all zeros.
    '''
    # Work in floating point: copying an integer input (e.g. step counts)
    # would otherwise give an integer array and truncate the averages.
    x = np.asarray(x, dtype=float)
    if len(x) >= N:
        y = np.copy(x)
        y[N-1:] = np.convolve(x, np.ones((N, )) / N, mode='valid')
    else:
        y = np.zeros_like(x)
    return y


class RandomAgent(object):
    ''' Agent taking actions uniformly at random.

        The class is self-contained: the only `Agent` defined in this
        notebook is the neural network, whose constructor takes
        (input_size, output_size, hidden_size) and therefore cannot serve
        as a base class for an agent built from n_actions alone.

        Attributes:
            n_actions (int): number of discrete actions to choose from
            last_action (int): last action returned by forward(), None
                before the first call
    '''
    def __init__(self, n_actions: int):
        self.n_actions = n_actions
        self.last_action = None

    def forward(self, state: np.ndarray) -> int:
        ''' Compute an action uniformly at random across n_actions possible
            choices

            Args:
                state (np.ndarray): current state; ignored by this agent

            Returns:
                action (int): the random action
        '''
        self.last_action = np.random.randint(0, self.n_actions)
        return self.last_action



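Below, we define the helper classes and functions used for training: the experience tuple, a random agent, the replay buffer, the Q-network and the epsilon-decay schedule.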

In [None]:
# One environment transition, as stored in the replay buffer:
# (state, action, reward, next_state, done)
Experience = namedtuple(
    'Experience',
    'state action reward next_state done',
)


class RandomAgent(object):
    ''' Agent taking actions uniformly at random.

        The class is self-contained: the only `Agent` defined in this
        notebook is the neural network, whose constructor takes
        (input_size, output_size, hidden_size) and therefore cannot serve
        as a base class for an agent built from n_actions alone.

        Attributes:
            n_actions (int): number of discrete actions to choose from
            last_action (int): last action returned by forward(), None
                before the first call
    '''
    def __init__(self, n_actions: int):
        self.n_actions = n_actions
        self.last_action = None

    def forward(self, state: np.ndarray) -> int:
        ''' Compute an action uniformly at random across n_actions possible
            choices

            Args:
                state (np.ndarray): current state; ignored by this agent

            Returns:
                action (int): the random action
        '''
        self.last_action = np.random.randint(0, self.n_actions)
        return self.last_action

    
    
class ExperienceReplayBuffer(object):
    """ Fixed-capacity store of the agent's experiences.

        Backed by a deque with `maxlen`, so once the buffer is full every
        new experience pushes out the oldest one.
    """
    def __init__(self, maximum_length):
        self.buffer = deque(maxlen=maximum_length)

    def append(self, experience):
        """ Add one experience at the end of the buffer. """
        self.buffer.append(experience)

    def __len__(self):
        """ Number of experiences currently stored. """
        return len(self.buffer)

    def sample_batch(self, n):
        """ Draw n distinct experiences uniformly at random.

            Returns an iterator that yields one tuple of length n per
            experience field; for 5-field experiences it unpacks into
            states, actions, rewards, next states and done flags.

            Raises:
                IndexError: if n exceeds the number of stored experiences
        """
        available = len(self.buffer)
        if n > available:
            raise IndexError('Tried to sample too many elements from the buffer!')

        # Distinct positions in the buffer (sampling without replacement)
        picked = np.random.choice(available, size=n, replace=False)

        # Transpose "n experiences of k fields" into "k sequences of n items"
        chosen = (self.buffer[idx] for idx in picked)
        return zip(*chosen)

### Neural Network ###
class Agent(nn.Module):
    """ Feedforward Q-network: one ReLU hidden layer followed by a linear
        output layer producing `output_size` values per input row.
    """
    def __init__(self, input_size, output_size, hidden_size=124):
        super().__init__()

        # Hidden layer and its activation
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.input_layer_activation = nn.ReLU()
        # Second hidden layer: registered (so it appears in the state_dict)
        # but not applied in forward()
        self.input_layer1 = nn.Linear(hidden_size, hidden_size)
        # Linear read-out, one value per output unit
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """ Forward pass: input_layer -> ReLU -> output_layer. """
        hidden = self.input_layer_activation(self.input_layer(x))
        return self.output_layer(hidden)


    
def epsilon_decay(episode, Z, eps_max=0.99, eps_min=0.05, method="linear"):
    """ Exploration rate for a given episode.

        The schedule is 1-indexed: it equals eps_max at episode 1, reaches
        eps_min at episode Z and stays at eps_min afterwards.

        Args:
            episode (int): episode index; values below 1 are treated as 1
                so the result never exceeds eps_max
            Z (int): episode at which eps_min is reached
            eps_max (float): initial exploration rate
            eps_min (float): final exploration rate
            method (str): "linear" or "exp" (exponential) decay

        Returns:
            eps (float): exploration rate, never below eps_min

        Raises:
            ValueError: if method is neither "linear" nor "exp"
    """
    if method not in ("linear", "exp"):
        # Previously an unknown method silently returned None
        raise ValueError(
            "Unknown epsilon decay method: {!r} (expected 'linear' or 'exp')".format(method))

    # A decay window of at most one episode has no interior: the decay is
    # already over, and (Z - 1) would be a zero divisor below.
    if Z <= 1:
        return eps_min

    # Callers counting episodes from 0 would otherwise get eps > eps_max
    episode = max(episode, 1)

    if method == 'linear':
        eps = eps_max - ((eps_max - eps_min) * (episode - 1)) / (Z - 1)
    else:
        eps = eps_max * ((eps_min / eps_max) ** ((episode - 1) / (Z - 1)))

    return max(eps_min, eps)
    
    



Below we implement and execute the training

In [None]:
# Create a placeholder "best" network and write its untrained weights to disk.
# NOTE(review): dim_state and n_actions are assigned in the environment-setup
# cell further down, so that cell has to be executed before this one.
best_model= Agent(input_size=dim_state, output_size=n_actions)
torch.save(best_model.state_dict(), "best_model.pt")

In [None]:
### CREATE RL ENVIRONMENT ###
env = gym.make('LunarLander-v2')
env.reset()


# Parameters
N_episodes = 620  # Number of training episodes
discount_factor = 1.0  # Value of the discount factor
n_ep_running_average = 50  # Window (in episodes) of the running averages
n_actions = env.action_space.n  # Number of available actions
dim_state = len(env.observation_space.high)  # State dimensionality
L = 29025  # Maximum number of experiences kept in the replay buffer
N = 38  # Size of a training batch (also the number of warm-up episodes)
C = int(L / N)  # The target network is synchronised every C environment steps


hidden_size = 124  # Hidden-layer width passed to Agent
method = "linear"  # Epsilon decay schedule passed to epsilon_decay
lr = 0.00006  # Learning rate of the Adam optimizer
max_norm = 1.846938775510204  # Gradient-norm clipping threshold

# We will use these variables to compute the average episodic reward and
# the average number of steps per episode
episode_reward_list = []  # this list contains the total reward per episode
episode_number_of_steps = []  # this list contains the number of steps per episode

### Create Experience replay buffer ###
buffer = ExperienceReplayBuffer(maximum_length=L)

# Network that will hold the best weights seen during training; its initial
# (untrained) weights are written to disk so the checkpoint file always exists
best_model= Agent(input_size=dim_state, output_size=n_actions, hidden_size=hidden_size)
torch.save(best_model.state_dict(), "best_model.pth")

### Filling up Buffer with Random experiences
# N full episodes are played with a uniformly random policy and every
# transition is stored, so the first training step can already sample a batch.
agent = RandomAgent(n_actions)

for i in range(N):
    # Reset enviroment data and initialize variables
    done = False
    state = env.reset()

    while not done:
        # Take a random action
        action = agent.forward(state)

        # Get next state and reward. The done variable becomes True when
        # the environment reports that the episode has ended.
        next_state, reward, done, _ = env.step(action)

        exp = Experience(state, action, reward, next_state, done)

        buffer.append(exp)

        # Update state for next iteration
        state = next_state

    # Close environment
    # NOTE(review): this runs after every warm-up episode although the same
    # env is reset and used again afterwards - confirm this is intended.
    env.close()

### Create network ###
# Online network: trained at every step and used to select greedy actions
network = Agent(input_size=dim_state, output_size=n_actions, hidden_size=hidden_size)

# Target network: starts as an exact copy of the online network
target_network = Agent(input_size=dim_state, output_size=n_actions, hidden_size=hidden_size)
target_network.load_state_dict(network.state_dict())

### Create optimizer ###
optimizer = optim.Adam(network.parameters(), lr=lr)

### PLAY ENVIRONMENT ###
# The training loop that follows plays N_episodes episodes of the environment


# Best running-average reward seen so far and the current running average;
# both start at a value low enough that the first real average replaces it
max_score = -1000
avg = -1000
# Total number of environment steps taken during training; used to decide
# when the target network is synchronised
steps = 0

# tqdm progress bar iterating over the episode indices 0 .. N_episodes-1
EPISODES = trange(N_episodes)

# Episode at which epsilon reaches its minimum (93% of the training run).
# It does not change between episodes, so it is computed once.
Z = int(0.93 * N_episodes)

for episode in EPISODES:

    total_episode_reward = 0
    state = env.reset()  # Reset environment, returns initial state
    done = False  # Set to True by env.step once the episode has ended

    # Exploration rate for this episode. epsilon_decay evaluates
    # `episode - 1`, i.e. it expects a 1-based index, while the loop counter
    # starts at 0 - hence the +1 (otherwise episode 0 exceeds eps_max).
    epsilon = epsilon_decay(episode + 1, Z, eps_max=0.9, eps_min=0.05, method=method)

    t = 0  # Number of steps taken in the current episode
    while not done:
        # env.render()  # Uncomment to render (do not use on Google Colab)

        # Create state tensor with a leading batch dimension, in single
        # precision (torch.float32). Going through np.array avoids the slow
        # "list of ndarrays" path of torch.tensor.
        state_tensor = torch.tensor(np.array([state]), dtype=torch.float32)

        # Epsilon-greedy action selection
        explore = random.random() < epsilon

        if explore:
            action = np.random.randint(0, n_actions)
        else:
            network.eval()
            # Greedy action: index of the largest network output
            with torch.no_grad():
                values = network(state_tensor)
                action = values.max(1)[1].item()

        # env.step(action) returns 4 variables:
        # (1) next state; (2) reward; (3) done variable; (4) additional stuff
        next_state, reward, done, _ = env.step(action)

        # Update episode reward
        total_episode_reward += reward

        # Append experience to the buffer
        exp = Experience(state, action, reward, next_state, done)

        buffer.append(exp)

        steps += 1

        # Update state for next iteration
        state = next_state

        ### TRAINING ###
        # Train only once the buffer holds at least one full batch
        if len(buffer) >= N:
            network.train()
            # Sample a batch of N elements
            states, actions, rewards, next_states, dones = buffer.sample_batch(n=N)

            # Network output for the batch of states. Gradients are only
            # needed for the network parameters, not for the input batch.
            values = network(torch.tensor(np.array(states), dtype=torch.float32))

            actions = torch.tensor(actions, dtype=torch.int64)
            action_masks = F.one_hot(actions, n_actions)

            # Q(s, a): keep only the output of the action actually taken
            values = (action_masks * values).sum(dim=-1)

            # max_a Q_target(s', a) for every element of the batch
            target_values = target_network(
                torch.tensor(np.array(next_states), dtype=torch.float32))
            target_values = target_values.max(1)[0]

            rewards = torch.tensor(rewards, dtype=torch.float32)

            dones = torch.tensor(dones, dtype=torch.float32)

            # Bootstrapped target; (1 - dones) removes the bootstrap term
            # for transitions that ended an episode
            target_values = rewards + discount_factor * (1 - dones) * (target_values)

            # Compute loss function; the target is treated as a constant
            loss = nn.functional.mse_loss(
                values, target_values.detach())

            optimizer.zero_grad()
            # Compute gradient
            loss.backward()

            # Clip gradient norm to max_norm
            nn.utils.clip_grad_norm_(network.parameters(), max_norm=max_norm)

            # Apply the parameter update
            optimizer.step()

        # Synchronise the target network every C environment steps
        if steps % C == 0:
            target_network.load_state_dict(network.state_dict())

        t += 1

    # Add rewards and number of steps
    episode_reward_list.append(total_episode_reward)
    episode_number_of_steps.append(t)

    # Mean reward of the last n_ep_running_average episodes
    if episode > 65:
        avg = running_average(episode_reward_list[-60:], n_ep_running_average)[-1]

    # Keep (and checkpoint) the weights with the best running average so far
    if avg > max_score:
        max_score = avg
        best_model.load_state_dict(network.state_dict())
        torch.save(best_model.state_dict(), "best_model.pth")
        print("In episode {}, best score is {}".format(episode, avg))

    # Report the current episode index in the progress bar (the warm-up
    # loop's leftover index `i` was shown here before)
    EPISODES.set_description(
        "Episode {} - Reward/Steps: {:.1f}/{} - Avg. Reward/Steps: {:.1f}/{}".format(
            episode, total_episode_reward, t,
            running_average(episode_reward_list, n_ep_running_average)[-1],
            running_average(episode_number_of_steps, n_ep_running_average)[-1]))

# Close all the windows
env.close()

# Plot rewards and steps: raw per-episode values next to their running average
episode_axis = list(range(1, N_episodes + 1))
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))

# Left panel: total reward per episode
ax[0].plot(episode_axis, episode_reward_list, label='Episode reward')
ax[0].plot(
    episode_axis,
    running_average(episode_reward_list, n_ep_running_average),
    label='Avg. episode reward',
)
ax[0].set_xlabel('Episodes')
ax[0].set_ylabel('Total reward')
ax[0].set_title('Total Reward vs Episodes')
ax[0].legend()
ax[0].grid(alpha=0.3)

# Right panel: number of steps per episode
ax[1].plot(episode_axis, episode_number_of_steps, label='Steps per episode')
ax[1].plot(
    episode_axis,
    running_average(episode_number_of_steps, n_ep_running_average),
    label='Avg. number of steps per episode',
)
ax[1].set_xlabel('Episodes')
ax[1].set_ylabel('Total number of steps')
ax[1].set_title('Total number of steps vs Episodes')
ax[1].legend()
ax[1].grid(alpha=0.3)
plt.show()

################################################################################

print('Done.')



# Random search

It is always tricky to tune this kind of model. Below, we implement a random search over the hyper-parameters; it managed to solve the problem within 100 iterations (an average reward above 200 over 50 episodes).


In [None]:
def function(parameters, iteration):
    """ Train a DQN agent with one hyper-parameter configuration and log it.

        Reads the module-level names env, n_actions, dim_state,
        n_ep_running_average and out_file, and appends one row
        [max_score, parameters, iteration] to the CSV file out_file.

        Args:
            parameters (dict): configuration with the keys "N_episodes",
                "discount_factor", "L" (buffer size), "N" (batch size and
                number of warm-up episodes), "hidden_size", "method"
                (epsilon schedule), "lr" and "max_norm"
            iteration (int): index of this evaluation; only written to the log

        Returns:
            list: [max_score, parameters, iteration, best_model], where
                max_score is the best running-average reward observed
                (-1000 if none was ever computed) and best_model is an Agent
                holding the weights at that point. best_model is built with
                parameters["hidden_size"], so its state_dict must be loaded
                into an Agent of the same hidden size.
    """
    N_episodes = parameters["N_episodes"]
    discount_factor = parameters["discount_factor"]
    L = parameters["L"]
    N = parameters["N"]
    C = int(L / N)  # target network is synchronised every C training steps
    hidden_size = parameters["hidden_size"]
    method = parameters["method"]
    lr = parameters["lr"]
    max_norm = parameters["max_norm"]

    episode_reward_list = []       # this list contains the total reward per episode
    episode_number_of_steps = []   # this list contains the number of steps per episode

    ### Create Experience replay buffer ###
    buffer = ExperienceReplayBuffer(maximum_length=L)

    ### Filling up Buffer with Random experiences
    agent = RandomAgent(n_actions)

    for i in range(N):
        # Reset enviroment data and initialize variables
        done = False
        state = env.reset()

        while not done:
            # Take a random action
            action = agent.forward(state)

            # Get next state and reward. The done variable becomes True
            # when the environment reports the end of the episode.
            next_state, reward, done, _ = env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            buffer.append(exp)

            # Update state for next iteration
            state = next_state

        # Close environment
        env.close()

    ### Create networks ###
    # hidden_size is one of the searched hyper-parameters, so it has to be
    # passed on; otherwise every configuration trains the default width.
    network = Agent(input_size=dim_state, output_size=n_actions,
                    hidden_size=hidden_size)

    target_network = Agent(input_size=dim_state, output_size=n_actions,
                           hidden_size=hidden_size)
    target_network.load_state_dict(network.state_dict())

    ### Create optimizer ###
    optimizer = optim.Adam(network.parameters(), lr=lr)

    max_score = -1000
    avg = -1000
    # Number of training steps performed; drives the target network updates
    steps = 0

    # Initialize best model (same architecture as the trained network)
    best_model = Agent(input_size=dim_state, output_size=n_actions,
                       hidden_size=hidden_size)

    # Episode at which epsilon reaches its minimum (93% of the run)
    Z = int(0.93 * N_episodes)

    for episode in range(N_episodes):

        total_episode_reward = 0
        state = env.reset()   # Reset environment, returns initial state
        done = False          # Set to True by env.step at the end of the episode

        # epsilon_decay evaluates `episode - 1`, i.e. expects a 1-based
        # index, while the loop counter starts at 0 - hence the +1.
        epsilon = epsilon_decay(episode + 1, Z, eps_max=0.9, eps_min=0.05, method=method)

        t = 0
        while not done:
            # Create state tensor with a leading batch dimension, in single
            # precision. np.array avoids torch.tensor's slow path for lists
            # of ndarrays.
            state_tensor = torch.tensor(np.array([state]), dtype=torch.float32)

            # Epsilon-greedy action selection
            explore = random.random() < epsilon

            if explore:
                action = np.random.randint(0, n_actions)
            else:
                # Greedy action: index of the largest network output
                with torch.no_grad():
                    values = network(state_tensor)
                    action = values.max(1)[1].item()

            # env.step(action) returns 4 variables:
            # (1) next state; (2) reward; (3) done variable; (4) additional stuff
            next_state, reward, done, _ = env.step(action)

            # Update episode reward
            total_episode_reward += reward

            # Append experience to the buffer
            exp = Experience(state, action, reward, next_state, done)

            buffer.append(exp)

            # Update state for next iteration
            state = next_state

            ### TRAINING ###
            # Train only once the buffer holds at least one full batch
            if len(buffer) >= N:
                # Sample a batch of N elements
                states, actions, rewards, next_states, dones = buffer.sample_batch(n=N)

                # Training process, set gradients to 0
                optimizer.zero_grad()

                # Network output for the batch of states. Gradients are only
                # needed for the parameters, not for the input batch.
                values = network(torch.tensor(np.array(states), dtype=torch.float32))

                actions = torch.tensor(actions, dtype=torch.int64)
                action_masks = F.one_hot(actions, n_actions)

                # Q(s, a): keep only the output of the action actually taken
                values = (action_masks * values).sum(dim=-1)

                # max_a Q_target(s', a) for every element of the batch
                target_values = target_network(
                    torch.tensor(np.array(next_states), dtype=torch.float32))
                target_values = target_values.max(1)[0]

                rewards = torch.tensor(rewards, dtype=torch.float32)

                dones = torch.tensor(dones, dtype=torch.float32)

                # Bootstrapped target; (1 - dones) removes the bootstrap
                # term for transitions that ended an episode
                target_values = rewards + discount_factor * (1 - dones) * (target_values)

                # Compute loss function; the target is treated as a constant
                loss = nn.functional.mse_loss(
                    values, target_values.detach())

                # Compute gradient
                loss.backward()

                # Clip gradient norm to max_norm
                nn.utils.clip_grad_norm_(network.parameters(), max_norm=max_norm)

                # Apply the parameter update
                optimizer.step()

                steps += 1

            # Synchronise the target network every C training steps
            if steps % C == 0:
                target_network.load_state_dict(network.state_dict())

            t += 1

        # Add rewards and number of steps
        episode_reward_list.append(total_episode_reward)
        episode_number_of_steps.append(t)

        # Mean reward of the last n_ep_running_average episodes
        if episode > 65:
            avg = running_average(episode_reward_list[-60:], n_ep_running_average)[-1]

        # Remember the weights with the best running average so far
        if avg > max_score:
            max_score = avg
            best_model.load_state_dict(network.state_dict())

    # Append the result to the csv log. The context manager flushes and
    # closes the file on every call (the handle used to be left open), and
    # newline='' is what the csv module expects for its file objects.
    with open(out_file, 'a', newline='') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([max_score, parameters, iteration])

    # Close all the windows
    env.close()

    return [max_score, parameters, iteration, best_model]

In [None]:

import csv

# CSV log to which function() appends one row per hyper-parameter evaluation
out_file = 'parameters_dqn.csv'

# Mode 'w' starts a fresh log. newline='' is what the csv module expects for
# its file objects, and the context manager closes the file even if writing
# the header fails.
with open(out_file, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)

    # Write the headers to the file
    writer.writerow(['reward', 'params', 'iteration'])

In [None]:

import random

# Candidate values for every hyper-parameter read by function(); the random
# search draws one value per key for each evaluation.
# The np.linspace calls rely on the default number of points.
param_grid = {
    'N_episodes': list(range(100, 1000)),
    'discount_factor': list(np.linspace(0.2, 1)),
    'L': list(range(20000, 30000)),
    'N': list(range(4, 124)),
    'hidden_size': list(range(8, 124)),
    'method': ["linear","exp"],
    'lr': list(np.linspace(0.00001, 0.0001)),
    'max_norm': list(np.linspace(0.5, 2)),
    
}

# Number of random configurations to evaluate
MAX_EVALS=600
# DataFrame pre-allocated with one (initially empty) row per evaluation
random_results = pd.DataFrame(columns = ['reward', 'params', 'iteration'],
                       index = list(range(MAX_EVALS)))

In [None]:
import random
# Seeds Python's `random` module only, i.e. the hyper-parameter draws below
random.seed(50)
# Start from an untrained network so that the checkpoint file always exists
final_model= Agent(input_size=dim_state, output_size=n_actions)
torch.save(final_model.state_dict(), "best_model.pt")
# Best score over all evaluated configurations so far
best=-1000
# Iterate through the specified number of evaluations
for i in range(MAX_EVALS):
    
    print("Step: {}".format(i))
    # Randomly draw one value per hyper-parameter from its candidate list
    params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
    
    print(params)

    # Train with this configuration: [max_score, parameters, iteration, best_model]
    results_list = function(params, i)
    
    avg=results_list[0]
    
    # Keep and checkpoint the model of the best configuration seen so far
    if avg>best:
        best=avg
        final_model= results_list[-1]
        torch.save(final_model.state_dict(), "best_model.pt")
    
    # Everything except the model object
    print(results_list[:-1])
    
    # Add results to next row in dataframe
    random_results.loc[i, :] = results_list[:-1]

In [None]:
# NOTE(review): this replaces row 0 with the most recent results_list; if it
# is run after the search loop, the result of evaluation 0 is lost - confirm
# whether this cell is still needed.
random_results.loc[0, :] = results_list[:-1]

In [None]:
# Sort the configurations by their reward column, highest first, renumber
# the rows accordingly and show the top of the table
random_results.sort_values('reward', ascending = False, inplace = True)
random_results.reset_index(inplace = True, drop = True)
random_results.head()

In [None]:
# Hyper-parameters stored in row 0 (the best configuration, provided the
# sorting cell above has been executed)
random_results.loc[0, 'params']

In [None]:
# Plot rewards and steps: raw per-episode values next to their running average
# NOTE(review): function() keeps its reward and step lists local, so the
# module-level N_episodes, episode_reward_list and episode_number_of_steps
# plotted here are not produced by the random search - confirm which run this
# figure is meant to show.
episode_axis = list(range(1, N_episodes + 1))
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))

# Left panel: total reward per episode
ax[0].plot(episode_axis, episode_reward_list, label='Episode reward')
ax[0].plot(
    episode_axis,
    running_average(episode_reward_list, n_ep_running_average),
    label='Avg. episode reward',
)
ax[0].set_xlabel('Episodes')
ax[0].set_ylabel('Total reward')
ax[0].set_title('Total Reward vs Episodes')
ax[0].legend()
ax[0].grid(alpha=0.3)

# Right panel: number of steps per episode
ax[1].plot(episode_axis, episode_number_of_steps, label='Steps per episode')
ax[1].plot(
    episode_axis,
    running_average(episode_number_of_steps, n_ep_running_average),
    label='Avg. number of steps per episode',
)
ax[1].set_xlabel('Episodes')
ax[1].set_ylabel('Total number of steps')
ax[1].set_title('Total number of steps vs Episodes')
ax[1].legend()
ax[1].grid(alpha=0.3)
plt.show()


# Check solution

In [None]:
# Parameters
N_EPISODES = 50            # Number of episodes used to evaluate the policy
CONFIDENCE_PASS = 50       # Lower confidence bound required to pass the test

# Reward
episode_reward_list = []  # Used to store episodes reward

# Policy under test: the best network kept by the training cell
model=best_model
# Simulate episodes
print('Checking solution...')
EPISODES = trange(N_EPISODES, desc='Episode: ', leave=True)
for i in EPISODES:
    EPISODES.set_description("Episode {}".format(i))
    # Reset enviroment data
    done = False
    state = env.reset()
    total_episode_reward = 0.
    while not done:
        # Greedy action: index of the largest network output. No gradients
        # are needed for evaluation, and the state is cast to float32
        # explicitly (as in training) instead of relying on the dtype
        # returned by the environment.
        with torch.no_grad():
            q_values = model(torch.tensor(np.array([state]), dtype=torch.float32))
        _, action = torch.max(q_values, axis=1)

        # Get next state and reward. The done variable becomes True when
        # the environment reports the end of the episode.
        next_state, reward, done, _ = env.step(action.item())

        # Update episode reward
        total_episode_reward += reward

        # Update state for next iteration
        state = next_state

    # Append episode reward
    episode_reward_list.append(total_episode_reward)

    # Close environment
    env.close()

# Mean episodic reward and half-width of its 95% confidence interval
avg_reward = np.mean(episode_reward_list)
confidence = np.std(episode_reward_list) * 1.96 / np.sqrt(N_EPISODES)


print('Policy achieves an average total reward of {:.1f} +/- {:.1f} with confidence 95%.'.format(
                avg_reward,
                confidence))

# The test passes when the lower end of the confidence interval reaches
# CONFIDENCE_PASS
if avg_reward - confidence >= CONFIDENCE_PASS:
    print('Your policy passed the test!')
else:
    print("Your policy did not pass the test! The average reward of your policy needs to be greater than {} with 95% confidence".format(CONFIDENCE_PASS))
