In [1]:
# https://towardsdatascience.com/gradient-free-reinforcement-learning-neuroevolution-using-numpy-7377d6f6c3ea

import gym
import numpy as np


In [2]:
def relu(x):
    """Rectified linear unit: keep entries of ``x`` that are > 0, replace the rest with 0."""
    is_positive = x > 0
    return np.where(is_positive, x, 0)

def softmax(x):
    """Turn the scores in ``x`` into values that sum to 1.

    The maximum of ``x`` is subtracted before exponentiating, and any
    exponential that comes out as exactly 0 is replaced by 1e-15 so that no
    returned entry is exactly zero. The sum is taken over every element of
    the array.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    underflowed = exps == 0
    exps[underflowed] = 1e-15
    total = exps.sum()
    return np.array(exps / total)

In [3]:
# Let's define a neural network class that can interact with gym
class NeuralNet:
    """Feed-forward policy network that is scored on the gym ``CartPole-v0`` task.

    Parameters are kept in ``self.params`` as two parallel lists,
    ``"weights"`` and ``"biases"``, holding one array per pair of
    consecutive layers.
    """

    def __init__(
        self,
        n_units=None,
        copy_network=None,
        var=0.02,
        episodes=50,
        max_episode_length=200,
    ):
        """Build a fresh random network, or a mutated copy of ``copy_network``.

        Args:
            n_units: Sequence of layer sizes, e.g. ``(4, 16, 2)``. Required
                when ``copy_network`` is None; ignored otherwise.
            copy_network: Network whose parameters are cloned and perturbed.
                When given, ``n_units`` is taken from it.
            var: Passed as ``scale`` to ``np.random.normal`` for the noise
                added to every cloned weight and bias.
            episodes: Accepted for call compatibility; not read here.
            max_episode_length: Accepted for call compatibility; not read here.

        Raises:
            TypeError: If neither ``n_units`` nor ``copy_network`` is given.
        """
        if copy_network is None:
            if n_units is None:
                raise TypeError(
                    "n_units must be provided when copy_network is None"
                )
            self.n_units = n_units
            weights = []
            biases = []
            # One weight matrix and one bias vector per pair of adjacent layers.
            for i in range(len(n_units) - 1):
                weights.append(
                    np.random.normal(loc=0, scale=1, size=(n_units[i], n_units[i + 1]))
                )
                biases.append(np.zeros(n_units[i + 1]))
            self.params = {"weights": weights, "biases": biases}
        else:
            self.n_units = copy_network.n_units
            parent = copy_network.params
            # Mutate array by array. The layers have different shapes, so the
            # list must not be passed to np.copy as a whole (that would try to
            # build a single ragged array). `x + noise` allocates a new array,
            # so the parent's parameters are left untouched.
            self.params = {
                "weights": [
                    x + np.random.normal(loc=0, scale=var, size=x.shape)
                    for x in parent["weights"]
                ],
                "biases": [
                    x + np.random.normal(loc=0, scale=var, size=x.shape)
                    for x in parent["biases"]
                ],
            }

    def act(self, X):
        """Return the index of the largest network output for observation ``X``.

        NOTE(review): ReLU is applied to the output layer as well, so whenever
        every output pre-activation is <= 0 the outputs tie and ``np.argmax``
        returns index 0. Confirm this is intended.
        """
        a = X
        for w, b in zip(self.params["weights"], self.params["biases"]):
            a = relu((a @ w) + b)
        probs = softmax(a)
        return np.argmax(probs)

    def evaluate(self, episodes, max_episode_length, render_env, record):
        """Run the network on ``CartPole-v0`` and return its mean episode score.

        Args:
            episodes: Number of episodes to run.
            max_episode_length: Maximum number of steps taken per episode.
            render_env: When True, ``env.render()`` is called before each step.
            record: When True, the environment is wrapped in
                ``gym.wrappers.Monitor(env, "recording")``.

        Returns:
            The mean score over all episodes, or 0 when no episode was run.
            An episode that ends early scores the zero-based index of the step
            that reported ``done``; an episode that lasts all
            ``max_episode_length`` steps scores ``max_episode_length``.
        """
        rewards = []
        env = gym.make("CartPole-v0")
        try:
            # Raise the step cap on the env returned by gym.make *before* any
            # wrapper is added, so the attribute is set on that object rather
            # than on the Monitor wrapper.
            env._max_episode_steps = 1e20
            if record is True:
                env = gym.wrappers.Monitor(env, "recording")
            for i_episode in range(episodes):
                observation = env.reset()
                for t in range(max_episode_length):
                    if render_env is True:
                        env.render()
                    observation, _, done, _ = env.step(self.act(np.array(observation)))
                    if done:
                        rewards.append(t)
                        break
                else:
                    # The episode never reported `done`: the network survived
                    # every allowed step, so give it the best possible score
                    # instead of silently dropping the episode.
                    rewards.append(max_episode_length)
        finally:
            # Always release the environment, even if a step raised.
            env.close()
        if len(rewards) == 0:
            return 0
        return np.array(rewards).mean()


In [4]:
# Build a sample network: 4 inputs, one hidden layer of 16 units, 2 outputs.
nn = NeuralNet(n_units=(4, 16, 2))


In [8]:
# Each item is the full weight matrix connecting one layer to the next.
for layer_weights in nn.params["weights"]:
    print(layer_weights)


[[ 1.35163077 -0.34792862  0.95637791 -1.20826669 -0.23091117 -1.45033488
  -0.20408373 -0.53377436 -1.10061192  2.02057101 -0.95428455  0.08806834
   1.53036936  1.2203435  -0.68148    -2.50648532]
 [ 0.92755609 -1.08481487 -0.65936592  0.37746139  1.51445279  1.62985334
   0.25903743  1.74319468 -1.27386086 -0.53726439 -0.87206817  0.98973355
  -0.49018017 -1.6987895  -0.34723071  0.56179819]
 [ 0.9223656  -0.87064171 -0.92844136  1.24608851  0.44982904  0.61723193
  -1.20774945 -0.62646413 -0.4579802   0.0866462   0.1346366   0.00510909
   0.53342258  0.33625185 -1.82580301  2.22146275]
 [ 0.69311095 -0.30376808 -0.0832973  -1.71598737 -0.33147245 -0.89043369
  -0.31821008 -0.53566177  0.4227945  -0.38510081 -0.36170623 -0.63688291
   0.76716855  0.58411951  1.13941847  2.79467444]]
[[ 0.14427202 -0.6772839 ]
 [ 1.3838281  -0.43717594]
 [-0.44153868 -2.13686634]
 [-0.4281652   0.0889833 ]
 [-1.37492218 -0.51241493]
 [-0.85427713 -0.40830718]
 [ 0.14342792 -1.73352327]
 [-0.75019743 

In [21]:
# Defining our class that handles populations of networks
class GeneticNetworks:
    """Population of ``NeuralNet`` policies evolved by keeping the best and mutating it.

    Each generation every network is scored with ``evaluate``; the highest
    scorer is kept unchanged and the rest of the population is replaced by
    mutated copies of it.
    """

    def __init__(
        self,
        architecture=(4, 16, 2),
        population_size=50,
        generations=500,
        render_env=True,
        record=False,
        mutation_variance=0.02,
        verbose=False,
        print_every=1,
        episodes=10,
        max_episode_length=200,
    ):
        """Create the initial random population and store the run settings.

        Args:
            architecture: Layer sizes passed to every initial ``NeuralNet``.
            population_size: Number of networks in the population.
            generations: Number of generations run by ``fit``.
            render_env: Forwarded to ``NeuralNet.evaluate``.
            record: Forwarded to ``NeuralNet.evaluate``.
            mutation_variance: Forwarded as ``var`` when creating children.
            verbose: When True, ``fit`` prints a per-generation summary.
            print_every: Print the summary every this many generations.
            episodes: Episodes per evaluation, forwarded to ``evaluate``.
            max_episode_length: Step limit per episode, forwarded to ``evaluate``.
        """
        self.networks = [NeuralNet(architecture) for _ in range(population_size)]
        self.population_size = population_size
        self.generations = generations
        self.mutation_variance = mutation_variance
        self.verbose = verbose
        self.print_every = print_every
        # Best score of each generation, appended by fit().
        self.fitness = []
        self.episodes = episodes
        self.max_episode_length = max_episode_length
        self.render_env = render_env
        self.record = record

    def fit(self):
        """Evolve the population and store the winner in ``self.best_network``.

        After the call, ``self.fitness`` has one entry per generation (the best
        score of that generation) and ``self.best_network`` is the top scorer
        of the last evaluated generation. If ``generations`` is 0 nothing is
        evaluated and ``self.best_network`` falls back to the first network of
        the population (or None if the population is empty).
        """
        for i in range(self.generations):
            # Score every network in the current population.
            rewards = np.array(
                [
                    x.evaluate(
                        self.episodes,
                        self.max_episode_length,
                        self.render_env,
                        self.record,
                    )
                    for x in self.networks
                ]
            )
            self.fitness.append(np.max(rewards))
            elite = self.networks[np.argmax(rewards)]
            # Everyone except the elite is replaced by a mutated copy of it.
            children = [
                NeuralNet(
                    copy_network=elite,
                    var=self.mutation_variance,
                    max_episode_length=self.max_episode_length,
                )
                for _ in range(self.population_size - 1)
            ]
            # The elite always sits at index 0 of the next population.
            self.networks = [elite] + children
            if self.verbose is True and i % self.print_every == 0:
                print(
                    "Generation:",
                    i + 1,
                    "| Highest Reward:",
                    rewards.max().round(1),
                    "| Average Reward:",
                    rewards.mean().round(1),
                )

        # The population was rebuilt with the elite first, so the winner is at
        # index 0. Indexing the new list with the old argmax position would
        # return an unevaluated child instead.
        self.best_network = self.networks[0] if self.networks else None


In [22]:
# Train a population of networks and report how long it took.

from time import time

training_settings = dict(
    architecture=(4, 16, 2),
    population_size=64,
    generations=5,
    episodes=15,
    mutation_variance=0.1,
    max_episode_length=10000,
    render_env=False,
    verbose=True,
)

start_time = time()
genetic_pop = GeneticNetworks(**training_settings)
genetic_pop.fit()
elapsed_seconds = round(time() - start_time, 3)
print('Finished in', elapsed_seconds, 'seconds')

  return array(a, order=order, subok=subok, copy=True)


Generation: 1 | Highest Reward: 444.5 | Average Reward: 35.0
Generation: 2 | Highest Reward: 1332.0 | Average Reward: 229.6
Generation: 3 | Highest Reward: 1590.1 | Average Reward: 388.6
Generation: 4 | Highest Reward: 8699.5 | Average Reward: 663.6
Generation: 5 | Highest Reward: 3801.7 | Average Reward: 512.2
Finished in 413.683 seconds


In [23]:
# Watch the best network play three rendered episodes; the cell displays
# the value returned by evaluate().
best_net = genetic_pop.best_network
best_net.evaluate(
    episodes=3,
    max_episode_length=int(1e10),
    render_env=True,
    record=False,
)