# CartPole RS/ES/GA

I originally planned to try random search (RS), evolution strategies (ES), hill climbing, and a genetic algorithm (GA). However, the task seems to be so simple that anything other than random search is largely ineffective.

In [1]:
import gym
import numpy as np
from itertools import count

# Shared module-level environment used by run_episode. `.unwrapped` takes the
# underlying environment without gym's wrappers -- presumably so that episode
# length is limited only by run_episode's max_t (TODO confirm for the
# installed gym version).
env = gym.make('CartPole-v0').unwrapped

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [2]:
class LinearModel:
    """Bias-free linear map from a state vector to one score per action.

    Args:
        shape: ``(num_inputs, num_actions)`` shape of the weight matrix.

    Attributes are limited to ``weights``, which is drawn uniformly from
    ``[-1/sqrt(num_inputs), 1/sqrt(num_inputs))`` at construction time.
    """

    def __init__(self, shape=(4, 2)):
        # Scale the initial range by the fan-in (number of input features).
        bound = 1 / np.sqrt(shape[0])
        self.weights = np.random.uniform(low=-bound, high=bound, size=shape)

    def __call__(self, x):
        """Return ``x . weights``, i.e. one value per action column."""
        return x.dot(self.weights)

def get_action(action_values, eps=0.1):
    """Choose an action epsilon-greedily from a vector of action values.

    With probability ``eps`` a uniformly random action index is returned;
    otherwise the index of the largest entry of ``action_values``.

    Args:
        action_values: 1-D NumPy array holding one score per action.
        eps: Exploration probability; ``0.0`` makes the choice fully greedy.

    Returns:
        The selected action index as a plain ``int``.
    """
    if np.random.uniform() < eps:
        # Sample over however many actions were scored instead of assuming
        # exactly two, so the helper also works for wider models.
        return int(np.random.randint(len(action_values)))
    return int(action_values.argmax())

In [3]:
def run_episode(model, eps=0.1, max_t=250, render=False):
    """Play one episode on the module-level ``env`` and report how long it ran.

    Args:
        model: Callable mapping a state to a vector of action values.
        eps: Exploration probability forwarded to ``get_action``.
        max_t: Step index at which the episode is cut off.
        render: When true, ``env.render()`` is called after every step.

    Returns:
        The zero-based index of the last step taken, i.e. the number of
        steps executed minus one; it never exceeds ``max_t``.
    """
    env.reset()
    observation = env.state

    step_index = 0
    while True:
        chosen = get_action(model(observation), eps)
        observation, _reward, finished, _info = env.step(chosen)

        if render:
            env.render()

        # Stop when the environment reports termination or the cap is hit.
        if finished or step_index >= max_t:
            return step_index
        step_index += 1

In [4]:
def avg_reward(model, num_episodes=100):
    """Return the mean ``run_episode`` result of ``model`` over several episodes.

    Episodes are played greedily (``eps=0.0``) with ``max_t=1000``.

    Args:
        model: Callable mapping a state to a vector of action values.
        num_episodes: Number of evaluation episodes to average over.

    Returns:
        The sum of the per-episode results divided by ``num_episodes``.
    """
    total = 0
    for _ in range(num_episodes):
        total += run_episode(model, eps=0.0, max_t=1000)
    return total / num_episodes

def random_search(num_episodes):
    """Sample random linear models and keep the best-scoring one.

    Each candidate is a freshly initialised ``LinearModel`` scored on a single
    episode (default exploration rate, ``max_t=1000``). On ties the earliest
    candidate is kept.

    Args:
        num_episodes: Number of candidate models to sample and evaluate.

    Returns:
        The candidate with the highest episode result, or ``None`` when
        ``num_episodes`` is not positive.
    """
    # Start below any attainable score so that the first candidate is always
    # accepted; starting at 0 would discard candidates that score exactly 0
    # and could leave the result as None.
    best_reward = float("-inf")
    best_model = None

    for _ in range(num_episodes):
        model = LinearModel()
        reward = run_episode(model, max_t=1000)

        if reward > best_reward:
            best_reward = reward
            best_model = model

    return best_model

# Weight perturbations used in ES and GA.
def perturbation(shape):
    """Draw zero-mean Gaussian noise with the given ``shape``.

    The standard deviation is ``1 / sqrt(shape[0])``, i.e. it shrinks with
    the size of the first dimension.
    """
    fan_in = shape[0]
    noise_scale = 1 / np.sqrt(fan_in)
    return np.random.normal(0.0, noise_scale, shape)

def evolution_strategies(num_iters, npop, sigma=0.1, alpha=0.1):
    """Train a ``LinearModel`` with a simple evolution-strategies update.

    Starting from all-zero weights, every iteration scores the current
    weights, scores ``npop`` randomly perturbed copies, and moves the weights
    along the fitness-weighted sum of the perturbations.

    Args:
        num_iters: Number of update iterations.
        npop: Number of perturbed weight sets evaluated per iteration.
        sigma: Multiplier applied to each sampled perturbation.
        alpha: Step size of the weight update.

    Returns:
        The trained ``LinearModel``.
    """
    model = LinearModel()
    model.weights = np.zeros_like(model.weights)

    for _ in range(num_iters):
        # Score the unperturbed weights first; this is the baseline that the
        # perturbed scores are compared against.
        baseline = run_episode(model)
        noise = sigma * np.array([perturbation(model.weights.shape)
                                  for _ in range(npop)])
        fitness = np.zeros(npop)

        # Evaluate each perturbation in place, then undo it.
        for i, delta in enumerate(noise):
            model.weights += delta
            fitness[i] = run_episode(model)
            model.weights -= delta

        # Relative improvement over the baseline. The denominator is floored
        # at 1 so a baseline score of 0 cannot produce NaN/inf weights.
        fitness = (fitness - baseline) / max(baseline, 1)

        # noise has shape (npop, *weights.shape); move the population axis
        # last so the dot product sums over population members.
        gradient = noise.transpose(1, 2, 0).dot(fitness)
        # noise is already scaled by sigma, so the net step is alpha / npop
        # times the fitness-weighted sum of the unscaled perturbations.
        gradient *= alpha / (npop * sigma)
        model.weights += gradient

    return model

In [8]:
# Train one model per method: random search over 80 candidate models, and
# evolution strategies for 8 iterations with a population of 10.
model_RS = random_search(80)
model_ES = evolution_strategies(8, 10)

In [5]:
# Compare the two methods: run each training procedure 25 times from scratch,
# score every resulting model with avg_reward (greedy episodes, 100 by
# default), and average the 25 scores.
fitness_RS = sum(avg_reward(random_search(80)) for _ in range(25))/25
fitness_ES = sum(avg_reward(evolution_strategies(8, 10)) for _ in range(25))/25
print(fitness_RS)
print(fitness_ES)

886.1072
555.6339999999999


In [10]:
# Render one greedy (eps=0.0) episode of the ES-trained model with max_t=1000.
run_episode(model_ES, eps=0.0, max_t=1000, render=True)

1000

In [None]:
# Shut the environment down -- presumably this also closes the render window
# opened by the rendered episode (verify against the installed gym version).
env.close()