These results build on the blog post https://towardsdatascience.com/reinforcement-learning-without-gradients-evolving-agents-using-genetic-algorithms-8685817d84f and its accompanying GitHub notebook: https://github.com/paraschopra/deepneuroevolution/blob/master/openai-gym-cartpole-neuroevolution.ipynb



# Import Useful Libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import time
import math
import copy

In [0]:
#gym library for Reinforcement Learning Agents
import gym
from gym.wrappers import Monitor

In [18]:
#Pytorch Library for Deep Reinforcement Learning Implementation
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#disable gradients as we will not use them
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f95d8f70198>

# Initialization: Environment Set-up and Random Generation of Agents

We start by setting up the environment and the hyperparameters below. The initial population consists of 300 randomly generated agents.



In [0]:
class CartPoleAI(nn.Module):
    """Small fully connected policy network.

    Maps a batch of 4-feature observations to a probability distribution
    over 2 actions, using a single hidden layer of 32 ReLU units.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(4, 32, bias=True),
            nn.ReLU(),
            nn.Linear(32, 2, bias=True),
            # Softmax is taken over dim 1, so inputs must be 2-D: (batch, 4).
            nn.Softmax(dim=1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, inputs):
        """Return the action probabilities computed by ``self.fc``."""
        return self.fc(inputs)

In [0]:
nbre_actions = 2 # number of discrete actions: move the cart left or right

In [0]:
def return_random_agents(num_agents):
    """Create ``num_agents`` fresh ``CartPoleAI`` networks.

    Each network has gradient tracking switched off on all of its
    parameters and is then passed to ``init_weights``.
    NOTE(review): ``init_weights`` is defined outside this section;
    presumably it randomises the weights in place -- confirm.

    Returns:
        A list of ``num_agents`` agents (empty when ``num_agents`` is 0).
    """

    def _new_agent():
        net = CartPoleAI()
        # Evolution never back-propagates, so no parameter needs a gradient.
        for weight in net.parameters():
            weight.requires_grad = False
        init_weights(net)
        return net

    return [_new_agent() for _ in range(num_agents)]
    

In [0]:
# Build the initial population of 300 agents.
agents = return_random_agents(300)

# Computing the Rewards (fitness function)

The functions below run each agent for n episodes and compute its average reward, which serves as the fitness score.

In [0]:
def run_agents(agents):
    """Play one "CartPole-v0" episode per agent and return the total rewards.

    Args:
        agents: iterable of policy networks. Each one is called with a
            ``(1, n_features)`` float tensor built from the observation and
            must return a 2-D tensor whose first row holds one probability
            per action (``nbre_actions`` entries).

    Returns:
        A list with the summed reward of each agent's episode, in the same
        order as ``agents``.
    """
    reward_agents = []
    env = gym.make("CartPole-v0")

    # A new environment is created on every call, so it must always be
    # released -- even when an agent raises in the middle of an episode.
    try:
        for agent in agents:
            agent.eval()

            observation = env.reset()
            total_reward = 0

            # Hard cap of 250 steps per episode; the environment normally
            # signals `done` before that.
            for _ in range(250):
                state_observed = torch.tensor(observation).type('torch.FloatTensor').view(1, -1)
                choice_proba = agent(state_observed).detach().numpy()[0]

                # Sample the action from the policy instead of taking the
                # argmax, so the agent's behaviour stays stochastic.
                next_action = np.random.choice(range(nbre_actions), 1, p=choice_proba).item()

                observation, reward, done, _ = env.step(next_action)
                total_reward += reward

                if done:
                    break

            reward_agents.append(total_reward)
    finally:
        env.close()

    return reward_agents

In [0]:
def return_average_score(agent, runs):
    """Return the mean reward of ``agent`` over ``runs`` episodes.

    Every episode is played through ``run_agents`` with a one-element
    population. ``runs`` must be non-zero, otherwise the final division
    raises ``ZeroDivisionError``.
    """
    episode_scores = (run_agents([agent])[0] for _ in range(runs))
    return sum(episode_scores, 0.) / runs

In [0]:
def run_agents_n_times(agents, runs):
    """Return the average score over ``runs`` episodes for every agent.

    The result is a list aligned with ``agents``: entry ``k`` is
    ``return_average_score(agents[k], runs)``.
    """
    return [return_average_score(candidate, runs) for candidate in agents]

In [0]:
def mutate(agent, mutation_power=0.02):
    """Return a mutated deep copy of ``agent``; ``agent`` itself is untouched.

    Every parameter of the copy receives additive Gaussian noise
    ``mutation_power * N(0, 1)``, one independent sample per element.
    The noise is drawn from NumPy's global random state, one parameter at a
    time in ``parameters()`` order and in C (row-major) order within each
    parameter.

    Parameters of any rank are mutated in a single vectorised in-place add,
    rather than only ranks 1, 2 and 4 via element-by-element Python loops.

    Args:
        agent: the ``nn.Module`` to copy and perturb.
        mutation_power: standard deviation of the added noise.

    Returns:
        The perturbed copy of ``agent``.
    """
    child_agent = copy.deepcopy(agent)

    # In-place updates of leaf parameters are only legal with autograd off;
    # do not rely on the caller having disabled it globally.
    with torch.no_grad():
        for param in child_agent.parameters():
            # Scale in float64 first, then cast once to the parameter dtype.
            noise = mutation_power * np.random.standard_normal(size=tuple(param.shape))
            delta = torch.from_numpy(np.asarray(noise))
            param.add_(delta.to(dtype=param.dtype, device=param.device))

    return child_agent

In [0]:
def return_children(agents, sorted_parent_indexes, elite_index):
    """Breed the next generation from the selected parents.

    ``len(agents) - 1`` children are produced by mutating parents drawn
    uniformly at random (with replacement) from ``sorted_parent_indexes``.
    One unmutated elite, chosen by ``add_elite``, is appended last.

    Args:
        agents: the current population.
        sorted_parent_indexes: indexes into ``agents`` of the selected parents.
        elite_index: index into ``agents`` of the previous elite, or ``None``.

    Returns:
        ``(children, new_elite_index)`` where ``new_elite_index`` is the
        position of the elite inside ``children`` (always the last slot).
    """
    pool_size = len(sorted_parent_indexes)

    # N-1 mutated children; the remaining slot is reserved for the elite.
    offspring = [
        mutate(agents[sorted_parent_indexes[np.random.randint(pool_size)]])
        for _ in range(len(agents) - 1)
    ]

    offspring.append(add_elite(agents, sorted_parent_indexes, elite_index))

    return offspring, len(offspring) - 1

In [0]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    """Return a deep copy of the best agent among the elite candidates.

    Candidates are the first ``only_consider_top_n`` entries of
    ``sorted_parent_indexes`` plus, when given, ``elite_index``. Each
    candidate is re-scored with ``return_average_score`` over 5 runs; the
    highest score wins and ties go to the earliest candidate.
    """
    contenders = sorted_parent_indexes[:only_consider_top_n]
    if elite_index is not None:
        contenders = np.append(contenders, [elite_index])

    # Score every contender first, in candidate order, then pick the winner.
    scored = [(return_average_score(agents[idx], runs=5), idx) for idx in contenders]

    best_score, best_index = None, None
    for score, idx in scored:
        # Strict comparison keeps the first contender when scores are equal.
        if best_score is None or score > best_score:
            best_score, best_index = score, idx

    return copy.deepcopy(agents[best_index])
    

In [52]:
# How many top-scoring agents are kept as parents for the next generation
number_fittest = 15

# Number of generations to evolve (results improve with 1000 generations)
generations = 100

# Position of the elite inside `agents`; none exists before the first generation
elite_index = None

for generation in range(generations):

    # Fitness of every agent: average reward over 3 runs
    rewards = run_agents_n_times(agents, 3)

    # argsort is ascending, so reverse it and keep the best `number_fittest` indexes
    ranking = np.argsort(rewards)
    sorted_parent_indexes = ranking[::-1][:number_fittest]

    top_rewards = [rewards[best_parent] for best_parent in sorted_parent_indexes]

    print("### Generation number {} ### ".format(generation))
    print("Rewards for the top {} fittest agents: {}".format(number_fittest, top_rewards))
    print("mean reward is {}".format(np.mean(np.array(top_rewards))))

    # Breed the next generation from the selected parents (elite kept last)
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # The children fully replace the current population
    agents = children_agents

### Generation number 0 ### 
Rewards for the top 15 fittest agents: [70.0, 69.66666666666667, 63.0, 62.333333333333336, 62.0, 56.666666666666664, 56.666666666666664, 56.333333333333336, 56.333333333333336, 56.0, 55.666666666666664, 55.0, 54.666666666666664, 54.666666666666664, 52.333333333333336]
mean reward is 58.75555555555555
### Generation number 1 ### 
Rewards for the top 15 fittest agents: [85.0, 67.0, 67.0, 66.33333333333333, 66.33333333333333, 61.666666666666664, 61.333333333333336, 60.666666666666664, 60.333333333333336, 59.0, 58.0, 58.0, 57.666666666666664, 56.666666666666664, 55.666666666666664]
mean reward is 62.7111111111111
### Generation number 2 ### 
Rewards for the top 15 fittest agents: [68.66666666666667, 66.0, 64.0, 63.0, 63.0, 59.333333333333336, 59.333333333333336, 58.666666666666664, 56.666666666666664, 56.0, 56.0, 55.666666666666664, 55.333333333333336, 54.666666666666664, 54.666666666666664]
mean reward is 59.39999999999999
### Generation number 3 ### 
Rewards 

KeyboardInterrupt: ignored