In [2]:
import numpy as np
import time
import gym
render = False # to control rendering of the learning process, It will take very long if you render the learning process.
episode_len = 200  # length of a single episode
number_of_episodes = 10  # evaluate a policy over 10 episodes


def run_episode(env, policy):
    total_reward = 0
    done = False
    observation = env.reset() 
    while True:
        if render:
            env.render()
        action = 0 if Multiplyweights(policy, observation) < 0 else 1 # we have a binary set of actions, move left or right.
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    if render:
        env.close()
    return total_reward


def Multiplyweights(policy, observation):
    sum=0
    for x in range(4):
        sum += (policy[x] * observation[x])
    return sum


def evaluate_policy(env, policy):
    total_rewards = 0.0
    for _ in range(number_of_episodes):
        total_rewards += run_episode(env, policy)
    return ((total_rewards/2000)*100)


def gen_random_policy():
    return np.random.rand(4) * 2 - 1


def crossover(policy1, policy2):  # 20% chance of switching each attribute 
    new_policy = policy1.copy()
    for i in range(4):
        rand = np.random.uniform()
        if rand > 0.79:
            new_policy[i] = policy2[i]
    return new_policy


def mutation(policy):  # less than 10% chance of changing 1 segment
    new_policy = policy.copy()
    for i in range(4):
        rand = np.random.uniform()  # check this
        if rand < 0.09:
            new_policy[i] = np.random.rand(1) * 2 - 1
    return new_policy


env = gym.make("CartPole-v1")
best_policy = None
n_policy = 100  # Number of Policies to start with
n_steps = 5  # number of generations we want to go to
policy_pop = [gen_random_policy() for _ in range(n_policy)]  # 2d array of our 100 policies
policy_scores = []
for idx in range(n_steps):
    policy_scores = [evaluate_policy(env, p) for p in policy_pop]  # TO store the score of each Policy
    print('Generation %d : max score = %0.2f' % (idx + 1, max(policy_scores)))
    policy_ranks = list(reversed(np.argsort(policy_scores)))  # List to store the index position of  policies in decending order
    elite_set = [policy_pop[x] for x in policy_ranks[:5]]  # TO store the Top 5 policies
    select_probs = np.array(policy_scores) / np.sum(policy_scores)  # Dividing each policy score with total policy score
    child_set = [crossover(
        policy_pop[np.random.choice(range(n_policy), p=select_probs)],
        policy_pop[np.random.choice(range(n_policy), p=select_probs)])
        for _ in range(n_policy - 5)]
    mutated_list = [mutation(p) for p in child_set]  # creating  a mutated list from the policies os the child set
    policy_pop = elite_set  # changing our policy set to elite set or say shortening our list to only elite onces
    policy_pop += mutated_list
    policy_score = [evaluate_policy(env, p) for p in policy_pop]
    best_policy = policy_pop[np.argmax(policy_score)]  # select the best policy by index of the max policy score  

observation = env.reset()
for _ in range(1000): # Testing!!
    action = 0 if Multiplyweights(best_policy, observation) < 0 else 1
    env.render()
    observation, reward, done, info = env.step(action)
env.close()

Generation 1 : max score = 241.75
Generation 2 : max score = 250.00
Generation 3 : max score = 250.00
Generation 4 : max score = 250.00
Generation 5 : max score = 250.00
