In [1]:

# https://www.gymlibrary.ml/environments/box2d/bipedal_walker/

import gym 
import pickle
import random
import numpy as np 
from time import time

  logger.warn(


In [2]:
POP = 64
GENS = 8
KEEP_PERCENTAGE = 0.25

EPS = 2
MAX_EP_LEN = 2000
GOAL = 300

MUT_VAR = 0.08

MODEL_PATH = "model.bin"

ARCH = (24, 64, 4)

In [3]:
rng = np.random.default_rng(314)
def generate_random(size):
    rand_arr = rng.random(size)
    rand_signs = rng.choice([-1, 1], size)
    return rand_arr * rand_signs


def sigmoid(a):
    return 1/(1 + np.exp(-a))

# Vectorize sigmoid function 
sigmoid_v = np.vectorize(sigmoid)


def time_elapsed(prev, cur, round_digit=8):
    delta_time = cur - prev
    return f"\tTime elapsed: {round(delta_time, round_digit)} seconds"


In [4]:
# env = gym.make("BipedalWalker-v3")
# obs = env.reset()

# print(obs)

In [5]:
# env.render()

In [6]:
# env.step((-1, -1, -1, -1))

In [7]:
class NN:
    def __init__(self, arch, net_to_copy=None, mut_var=MUT_VAR):
        if net_to_copy is None:
            weights = [
                generate_random((arch[1], arch[0])),
                generate_random((arch[2], arch[1])),
            ]

            biases = [
                generate_random((arch[1],)),
                generate_random((arch[2],)),
            ]

        else:
            weights = net_to_copy.weights
            biases = net_to_copy.biases

            weights = [
                weight + np.random.normal(0.0, mut_var, weight.shape)
                for weight in weights
            ]
            biases = [
                bias + np.random.normal(0.0, mut_var, bias.shape) for bias in biases
            ]

        self.weights = weights
        self.biases = biases

    def forward(self, _obs):
        weights = self.weights
        biases = self.biases

        _obs = np.array(_obs)

        hidden = np.dot(weights[0], _obs) + biases[0]
        
        # print("Hidden:", hidden)

        output = np.dot(weights[1], hidden) + biases[1]
        
        # print("Output:", output)

        action = sigmoid_v(output)
        # print("Action:", action)
        return action

    def evaluate(
        self, episodes=EPS, max_ep_len=MAX_EP_LEN, render_env=False, record=False
    ):
        ep_rewards = []
        env = gym.make("BipedalWalker-v3")

        if record:
            env = gym.wrappers.Monitor(env, "recording")

        env._max_episode_steps = max_ep_len

        for i_ep in range(episodes):
            _obs = env.reset()
            done = False
            score = 0

            while not done:
                if render_env:
                    env.render()

                action = self.forward(_obs)

                _obs, reward, done, _ = env.step(action)
                score += reward

            ep_rewards.append(score)

        env.close()

        return np.array(ep_rewards).mean()


In [8]:
# nn = NN(ARCH)
# print([weight for weight in nn.weights])

In [9]:
# env = gym.make("BipedalWalker-v3")
# obs = env.reset()


In [10]:
# nn.weights

In [11]:
# action = nn.forward(obs)
# env.step(action)
# env.render()

In [12]:
class GA:
    def __init__(
        self,
        arch=ARCH,
        pop=POP,
        mut_var=MUT_VAR,
        verbose=False,
        eps=EPS,
        max_ep_len=MAX_EP_LEN,
        goal=GOAL,
        render_env=False,
        record=False,
    ):
        self.arch = arch
        self.pop = pop
        self.mut_var = mut_var
        self.verbose = verbose
        self.eps = eps
        self.max_ep_len = max_ep_len
        self.goal = goal
        self.render_env = render_env
        self.record = record

        self.nets = [NN(arch) for _ in range(pop)]

    def get_best_fit(self):
        fitness_arr = []
        sorted_nets = []

        for i, net in enumerate(self.nets):
            prev_time = time()

            fitness = self.evaluate(net)
            if self.verbose:
                print(
                    "Network:",
                    i + 1,
                    "\tFitness:",
                    round(fitness),
                    time_elapsed(prev_time, time()),
                )

            # Insertion sort
            inserted = False
            for i in range(len(fitness_arr)):
                if fitness_arr[i] > fitness:
                    fitness_arr.insert(i, fitness)
                    sorted_nets.insert(i, net)
                    inserted = True
                    break

            if not inserted:
                fitness_arr.append(fitness)
                sorted_nets.append(net)

            if fitness == self.goal:
                break

        fitnesses = np.array(fitness_arr)

        return sorted_nets, fitnesses

    def mutate_nets(self, parents):
        new_nets = [
            NN(self.arch, random.choice(parents), self.mut_var)
            for _ in range(self.pop - len(parents))
        ]
        return new_nets

    def train(self, gens=GENS):
        for i in range(gens):
            prev_time = time()

            nets, fitnesses = self.get_best_fit()

            if self.verbose:
                print(
                    "=====",
                    "Generation:",
                    i + 1,
                    "\tMaximum Reward:",
                    round(fitnesses.max()),
                    "\tAverage Reward:",
                    round(fitnesses.mean()),
                    time_elapsed(prev_time, time(), 3),
                    "=====",
                    "\n",
                )

            self.best_net = nets[-1]

            if fitnesses[-1] == self.goal:
                print("Callback reached: Max score obtained.")
                break

            parents = nets[int(POP * (1 - KEEP_PERCENTAGE)) :]
            children = self.mutate_nets(parents)

            self.nets = parents + children

    def load(self, path=MODEL_PATH):
        with open(path, "rb") as f:
            self.nets = pickle.load(f)

    def save(self, path=MODEL_PATH):
        with open(path, "wb") as f:
            pickle.dump(self.nets, f)

    def evaluate(self, net):
        mean_rewards = net.evaluate(
            self.eps, self.max_ep_len, self.render_env, self.record
        )
        return mean_rewards


In [13]:
ga = GA(verbose=True)
model_path = MODEL_PATH

In [14]:
ga.train()

Network: 3 	Fitness: -108 	Time elapsed: 0.28922606 seconds
Network: 4 	Fitness: -115 	Time elapsed: 0.25033045 seconds
Network: 5 	Fitness: -116 	Time elapsed: 0.25531769 seconds
Network: 6 	Fitness: -103 	Time elapsed: 0.24035692 seconds
Network: 7 	Fitness: -140 	Time elapsed: 1.38826632 seconds
Network: 8 	Fitness: -112 	Time elapsed: 0.19048977 seconds
Network: 9 	Fitness: -110 	Time elapsed: 0.19946527 seconds
Network: 10 	Fitness: -103 	Time elapsed: 0.21941376 seconds
Network: 11 	Fitness: -119 	Time elapsed: 0.38596725 seconds
Network: 12 	Fitness: -110 	Time elapsed: 0.23036718 seconds
Network: 13 	Fitness: -114 	Time elapsed: 1.32149673 seconds
Network: 14 	Fitness: -108 	Time elapsed: 0.21243191 seconds
Network: 15 	Fitness: -112 	Time elapsed: 0.23237944 seconds
Network: 16 	Fitness: -109 	Time elapsed: 0.11768413 seconds
Network: 17 	Fitness: -107 	Time elapsed: 0.16854787 seconds
Network: 18 	Fitness: -115 	Time elapsed: 0.13663459 seconds
Network: 19 	Fitness: -117 	Tim

In [15]:
ga.save(model_path)

In [17]:
ga.load(model_path)

record = False

if record:
    ga.best_net.evaluate(1, 200, True, True) # Record only records 200 steps
else:
    ga.nets[10].evaluate(1, 10000, True)

KeyboardInterrupt: 