In [1]:
# 4 input nodes
# ReLU
# 16 hidden layer nodes
# ReLU
# 2 output nodes
# Softmax

In [2]:
import gym 
import pickle
import numpy as np
from time import time

  logger.warn(


In [3]:
"""
env = gym.make("CartPole-v0")
obs = env.reset()
"""


'\nenv = gym.make("CartPole-v0")\nobs = env.reset()\n'

In [4]:
"""
action = env.action_space.sample()
obs, reward, done, _ = env.step(action)
env.render()
print(obs)
"""

'\naction = env.action_space.sample()\nobs, reward, done, _ = env.step(action)\nenv.render()\nprint(obs)\n'

In [5]:
"""
env.close()
"""

'\nenv.close()\n'

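### Demo of mutation
Each value is nudged up or down by `mut_var`:

- a = [1, 2, 3, 4, 5]
- mut_var = 0.1
- b = [1.1, 1.9, 3.1, 3.9, 4.9] (one possible result of mutating `a`)

- for i in range(len(a)):
    - a[i] += random.choice([-1, 1]) * mut_var

Note: the `NN` class below perturbs weights with Gaussian noise (`np.random.normal(0.0, mut_var, ...)`) rather than a fixed ±`mut_var` step.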

In [6]:
# --- Genetic algorithm settings ---
POPULATION = 32           # default number of networks per generation
GENERATIONS = 5           # default number of generations to train for
MUTATION_VARIANCE = 0.1   # passed as the `scale` of np.random.normal when mutating

# --- Evaluation settings ---
EPISODES = 8              # default episodes averaged per fitness evaluation
MAX_EP_LEN = 100_000      # default per-episode step cap

# --- Model settings ---
ARCH = (4, 16, 2)         # layer sizes: inputs, hidden nodes, outputs
MODEL_PATH = "model.bin"  # default pickle path used by GA.save / GA.load

In [7]:
def relu(a):
    """Element-wise rectified linear unit.

    Entries of ``a`` that are strictly greater than zero are kept; every
    other entry is replaced by 0.
    """
    is_positive = a > 0
    return np.where(is_positive, a, 0)

def softmax(a):
    """Normalize ``a`` into a probability distribution with the softmax function.

    The maximum entry is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would turn the output into NaN) for large inputs.

    Reference: https://www.pythonpool.com/numpy-softmax/

    Args:
        a: array-like of scores. The normalization runs over all entries.

    Returns:
        numpy array with the same shape as ``a`` whose entries sum to 1.
        An empty input yields an empty array.
    """
    a = np.asarray(a)
    if a.size == 0:
        # np.max would raise on an empty array; there is nothing to normalize.
        return np.exp(a)

    # Compute the exponentials once and reuse them for the normalizer.
    exps = np.exp(a - np.max(a))
    return exps / np.sum(exps)

def time_elapsed(prev, cur, round_digit=8):
    """Format the time between two timestamps as a tab-prefixed message.

    Args:
        prev: earlier timestamp, in seconds.
        cur: later timestamp, in seconds.
        round_digit: number of decimal digits the difference is rounded to.

    Returns:
        A string of the form "\\tTime elapsed: <seconds> seconds".
    """
    seconds = round(cur - prev, round_digit)
    return "\tTime elapsed: {} seconds".format(seconds)


In [8]:
class NN:
    """Small fully connected network that selects a discrete action by argmax.

    Every layer computes ``relu(W @ x + b)``; the final activations are passed
    through ``softmax`` and the index of the largest probability is returned.
    """

    def __init__(self, arch, net_to_copy=None, mut_var=MUTATION_VARIANCE):
        """
        Build a fresh random network, or a mutated copy of ``net_to_copy``.

        Architecture (for the default ``arch = (4, 16, 2)``)

        4 Input nodes with 16 weights each to 16 hidden layer nodes
        16 Hidden nodes with 2 weights each to 2 output layer nodes

        16 Biases for each of the hidden layer nodes
        2 Biases for each of the output layer nodes

        Args:
            arch: sequence of layer sizes (inputs, hidden..., outputs). Any
                number of layers is accepted; one weight matrix of shape
                ``(n_out, n_in)`` and one bias vector of shape ``(n_out,)`` is
                created per consecutive pair. Ignored when ``net_to_copy`` is
                given.
            net_to_copy: optional network whose weights and biases are copied
                and perturbed. The source network is not modified.
            mut_var: passed as the ``scale`` argument of ``np.random.normal``
                for the perturbation noise.
        """
        if net_to_copy is None:
            # (n_in, n_out) for every pair of consecutive layers.
            layer_dims = list(zip(arch[:-1], arch[1:]))

            weights = [np.random.rand(n_out, n_in) for n_in, n_out in layer_dims]
            biases = [np.zeros((n_out,)) for _, n_out in layer_dims]
        else:
            # Adding noise creates new arrays, so the parent stays untouched.
            weights = [
                weight + np.random.normal(0.0, mut_var, weight.shape)
                for weight in net_to_copy.weights
            ]
            biases = [
                bias + np.random.normal(0.0, mut_var, bias.shape)
                for bias in net_to_copy.biases
            ]

        self.weights = weights
        self.biases = biases

    def forward(self, _obs):
        """Return the index of the action chosen for observation ``_obs``.

        Args:
            _obs: array-like observation whose length matches the input layer.

        Returns:
            Index of the highest softmax probability --> 0, 1 for the default
            two-output architecture (matches the CartPole action space).
        """
        activation = np.array(_obs)

        # ReLU is applied after every layer, including the output layer.
        for weight, bias in zip(self.weights, self.biases):
            activation = relu(np.dot(weight, activation) + bias)

        probabilities = softmax(activation)
        return np.argmax(probabilities)

    def evaluate(
        self, episodes=EPISODES, max_ep_len=MAX_EP_LEN, render_env=False, record=False
    ):
        """Run the network on CartPole-v0 and return its mean episode reward.

        Args:
            episodes: number of episodes to play.
            max_ep_len: per-episode step cap written to the environment.
            render_env: when True, render every step.
            record: when True, wrap the environment in ``gym.wrappers.Monitor``
                writing to the "recording" directory.

        Returns:
            Mean of the per-episode total rewards (NaN when ``episodes`` is 0).
        """
        env = gym.make("CartPole-v0")

        try:
            # Set the step cap on the env returned by gym.make *before* any
            # further wrapping: assigning it on the Monitor wrapper only creates
            # an attribute on the wrapper and leaves the inner cap unchanged.
            # NOTE(review): assumes gym.make returns the time-limit wrapper as
            # the outermost layer - confirm for the installed gym version.
            env._max_episode_steps = max_ep_len

            if record:
                env = gym.wrappers.Monitor(env, "recording")

            ep_rewards = []
            for _ in range(episodes):
                _obs = env.reset()
                done = False
                score = 0

                while not done:
                    if render_env:
                        env.render()

                    action = self.forward(_obs)
                    _obs, reward, done, _ = env.step(action)
                    score += reward

                ep_rewards.append(score)
        finally:
            # Always release the environment (and render window), even when an
            # episode raises or the run is interrupted.
            env.close()

        return np.array(ep_rewards).mean()


In [9]:
# nn = NN(ARCH)

In [10]:
# nn.forward([1, 2, 3, 4])


In [11]:
# nn.evaluate(render_env=True, record=True)

In [12]:
class GA:
    """Simple elitist genetic algorithm over a population of ``NN`` networks.

    Each generation evaluates every network, keeps the best one, and refills
    the rest of the population with mutated copies of it.
    """

    def __init__(
        self,
        arch=ARCH,
        pop=POPULATION,
        gens=GENERATIONS,
        mut_var=MUTATION_VARIANCE,
        verbose=False,
        eps=EPISODES,
        max_ep_len=MAX_EP_LEN,
        render_env=False,
        record=False,
    ):
        """Store the hyperparameters and create the initial random population.

        Args:
            arch: layer sizes passed to every ``NN``.
            pop: number of networks per generation.
            gens: maximum number of generations run by ``train``.
            mut_var: mutation scale passed to ``NN`` when creating offspring.
            verbose: when True, print per-network and per-generation progress.
            eps: episodes per fitness evaluation.
            max_ep_len: per-episode step cap; also the score that stops training.
            render_env: forwarded to ``NN.evaluate``.
            record: forwarded to ``NN.evaluate``.
        """
        self.arch = arch
        self.pop = pop
        self.gens = gens
        self.mut_var = mut_var
        self.verbose = verbose
        self.eps = eps
        self.max_ep_len = max_ep_len
        self.render_env = render_env
        self.record = record

        self.nets = [NN(arch) for _ in range(pop)]

    def get_best_fit(self):
        """Evaluate the population and return the fittest network.

        Evaluation stops early as soon as one network scores exactly
        ``max_ep_len``, so ``fitnesses`` may be shorter than the population.

        Returns:
            Tuple ``(best_net, best_net_score, fitnesses)`` where ``fitnesses``
            is a numpy array of the scores that were computed.
        """
        fitness_arr = []
        for i, net in enumerate(self.nets):
            prev_time = time()

            fitness = self.evaluate(net)
            if self.verbose:
                print(
                    "Network:",
                    i + 1,
                    "\tFitness:",
                    round(fitness),
                    time_elapsed(prev_time, time()),
                )

            fitness_arr.append(fitness)

            # A perfect score cannot be beaten; skip the remaining networks.
            if fitness == self.max_ep_len:
                break

        fitnesses = np.array(fitness_arr)
        i_best_net = np.argmax(fitnesses)
        best_net_score = fitnesses[i_best_net]
        best_net = self.nets[i_best_net]

        return best_net, best_net_score, fitnesses

    def mutate_nets(self, best_net):
        """Return ``pop - 1`` mutated copies of ``best_net``."""
        new_nets = [NN(self.arch, best_net, self.mut_var) for _ in range(self.pop - 1)]
        return new_nets

    def train(self):
        """Run up to ``gens`` generations and store the winner in ``best_net``.

        Training stops early when the best score equals ``max_ep_len``. When
        ``gens`` is 0 no generation runs and ``best_net`` is left as it was
        (unset, or whatever ``load`` put there) instead of failing on an
        unbound local variable.
        """
        best_net = getattr(self, "best_net", None)

        for i in range(self.gens):
            prev_time = time()

            best_net, best_net_score, fitnesses = self.get_best_fit()

            if self.verbose:
                print(
                    "=====",
                    "Generation:",
                    i + 1,
                    "\tMaximum Reward:",
                    round(fitnesses.max()),
                    "\tAverage Reward:",
                    round(fitnesses.mean()),
                    time_elapsed(prev_time, time(), 3),
                    "=====",
                    "\n",
                )

            if best_net_score == self.max_ep_len:
                print("Callback reached: Max score obtained.")
                break

            new_nets = self.mutate_nets(best_net)

            # Elitism: the current best survives unchanged into the next generation.
            self.nets = [best_net] + new_nets

        if best_net is not None:
            self.best_net = best_net

    def load(self, path=MODEL_PATH):
        """Load a pickled network from ``path`` into ``best_net``.

        SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
        file. Only load model files you created yourself or fully trust.
        """
        with open(path, "rb") as f:
            self.best_net = pickle.load(f)

    def save(self, path=MODEL_PATH):
        """Pickle ``best_net`` to ``path``.

        Raises:
            AttributeError: if no network has been trained or loaded yet. The
                check happens before the file is opened so that an existing
                model file is not truncated by a failed save.
        """
        if getattr(self, "best_net", None) is None:
            raise AttributeError(
                "GA has no best_net yet; call train() or load() before save()"
            )

        with open(path, "wb") as f:
            pickle.dump(self.best_net, f)

    def evaluate(self, net):
        """Return the mean reward of ``net`` under this GA's evaluation settings."""
        mean_rewards = net.evaluate(
            self.eps, self.max_ep_len, self.render_env, self.record
        )
        return mean_rewards


In [13]:
# ga = GA(verbose=True)
# model_path = MODEL_PATH

In [14]:
# ga = GA(verbose=True, arch=(4, 8, 2), max_ep_len=10000, pop=64)
# model_path = "model-8 nodes-10k eps.bin"

"""
===== Generation: 2 	Maximum Reward: 10000 	Average Reward: 2734 	Time elapsed: 10.692 seconds ===== 
"""


'\n===== Generation: 2 \tMaximum Reward: 10000 \tAverage Reward: 2734 \tTime elapsed: 10.692 seconds ===== \n'

In [15]:
# ga = GA(verbose=True, arch=(4, 4, 2), max_ep_len=5000, pop=64, eps=2)
# model_path = "model-4 nodes-5k eps.bin"

"""
===== Generation: 1 	Maximum Reward: 5000 	Average Reward: 170 	Time elapsed: 0.814 seconds =====
"""

'\n===== Generation: 1 \tMaximum Reward: 5000 \tAverage Reward: 170 \tTime elapsed: 0.814 seconds =====\n'

In [16]:
# Deliberately tiny run: a single hidden node, one episode per evaluation and
# a 200-step cap per episode.
model_path = "model-too fast.bin"
ga = GA(
    arch=(4, 1, 2),
    pop=64,
    eps=1,
    max_ep_len=200,
    verbose=True,
)

"""
===== Generation: 1 	Maximum Reward: 200 	Average Reward: 104 	Time elapsed: 0.018 seconds =====
"""

'\n===== Generation: 1 \tMaximum Reward: 200 \tAverage Reward: 104 \tTime elapsed: 0.018 seconds =====\n'

In [17]:
# Train and save model
# ga.train()
# ga.save(model_path)

In [20]:
# Restore the saved network and replay it once with rendering enabled.
ga.load(model_path)

record = False

if not record:
    # Watch-only run with a very large step cap.
    ga.best_net.evaluate(episodes=1, max_ep_len=1000000, render_env=True)
else:
    # Recorded run, capped at 200 steps.
    ga.best_net.evaluate(episodes=1, max_ep_len=200, render_env=True, record=True)


KeyboardInterrupt: 