In [0]:
#Cartpole

import numpy as np
import gym

# Seed NumPy's global RNG so every np.random draw in this notebook is repeatable.
# NOTE(review): the environment presumably keeps its own RNG that this call
# does not seed — confirm if fully reproducible episodes are required.
np.random.seed(0)

# Create the CartPole-v1 environment used by the rest of the notebook.
env = gym.make("CartPole-v1")
# Show the action space (the recorded output of this cell is "Discrete(2)").
print(env.action_space)
# Number of discrete actions reported by the action space.
n_actions = env.action_space.n

Discrete(2)


In [0]:
def sigmoid_action(state,weight):
    """Sample a binary action from a logistic (sigmoid) policy.

    A constant 1 is prepended to ``state`` so that ``weight[0]`` acts as a
    bias term. The sigmoid of the dot product between the extended state and
    ``weight`` is used as the probability of picking action 1.

    Args:
        state: 1-D observation vector.
        weight: 1-D parameter vector with one more entry than ``state``.

    Returns:
        0 or 1, drawn via ``np.random.choice`` with probabilities
        ``[1 - p, p]`` where ``p`` is the sigmoid output.
    """
    # Bias feature goes first, followed by the raw observation.
    features = np.hstack(([1.0], state))
    logit = np.dot(features, weight)
    p_one = 1/(1+np.exp(-logit))
    # Index 0 -> probability of action 0, index 1 -> probability of action 1.
    distribution = np.array([1-p_one, p_one])
    return np.random.choice(np.arange(0,2), p=distribution)

def play_episode(w, t_max=1000):
  """Run one episode in ``env`` with the sigmoid policy and record it.

  Supports both Gym calling conventions: the older one where ``reset()``
  returns the observation and ``step()`` returns
  ``(obs, reward, done, info)``, and the newer one (Gym >= 0.26) where
  ``reset()`` returns ``(obs, info)`` and ``step()`` returns
  ``(obs, reward, terminated, truncated, info)``.

  Args:
    w: policy weights, forwarded to ``sigmoid_action``.
    t_max: maximum number of environment steps to take.

  Returns:
    A tuple ``(states, actions, total_reward)``: the observations the
    policy acted on, the actions it chose, and the sum of the rewards.
  """
  states, actions = [], []
  total_reward = 0

  reset_out = env.reset()
  # Newer Gym returns (observation, info dict); older Gym returns only the
  # observation. Checking for the info dict avoids mis-reading a tuple
  # observation as the new-style return value.
  if (isinstance(reset_out, tuple) and len(reset_out) == 2
      and isinstance(reset_out[1], dict)):
    s = reset_out[0]
  else:
    s = reset_out

  for t in range(t_max):
    action = sigmoid_action(s,w)
    step_out = env.step(action)
    if len(step_out) == 5:
      # New API: the episode ends on either termination or truncation.
      new_s, r, terminated, truncated, info = step_out
      done = terminated or truncated
    else:
      new_s, r, done, info = step_out

    # Record the state the action was taken in, not the resulting one.
    states.append(s)
    actions.append(action)
    total_reward += r

    s = new_s
    if done:
      break
  return states, actions, total_reward

In [0]:
# hyperparameters
npop = 8 # population size (perturbation directions evaluated per iteration)
sigma = 0.1 # noise standard deviation
alpha = 0.02 # learning rate
top_perf = 4 # Top performing directions to choose from
n_params = 5 # length of w: sigmoid_action prepends a bias 1 to the 4-dim state

w = np.zeros(n_params) # Our initial parameters are 0-s
for i in range(10000):

  # print weight and reward every 50 iterations
  if i%50==0:
    print('iter %d. w: %s, reward: %f' %
          (i, str(w), play_episode(w)[2]))

  # initialize memory for a population of w's, and their rewards
  N = np.random.randn(npop, n_params) # samples from a normal distribution N(0,1)
  R_plus = np.zeros(npop) # rewards of the positively perturbed weights
  R_minus = np.zeros(npop) # rewards of the negatively perturbed weights
  for j in range(npop):
    w_plus = w + sigma*N[j] # jitter w+ using gaussian of sigma 0.1
    w_minus = w - sigma*N[j] # jitter w- using gaussian of sigma 0.1
    R_plus[j] = play_episode(w_plus)[2] # Evaluate parameters w+
    R_minus[j] = play_episode(w_minus)[2] # Evaluate parameters for w-

  # Standard deviation of all 2*npop collected rewards, used to scale the step.
  # The two arrays must be concatenated: `R_minus + R_plus` adds them
  # element-wise and would yield the std of npop pairwise sums instead.
  # NOTE(review): Augmented Random Search normalises by the std of only the
  # rewards used in the update (the top directions); all rewards are used
  # here, as the variable name suggests — confirm which is intended.
  all_rewards = np.concatenate((R_plus, R_minus))
  sigma_r = all_rewards.std()

  # Choose best performing directions: rank each direction by the better of
  # its two rewards and keep the top_perf highest.
  merged = np.array([R_plus,R_minus]).T
  best_indx = np.argsort(-np.amax(merged,1))[:top_perf]
  R_plus = R_plus[best_indx]
  R_minus = R_minus[best_indx]
  N = N[best_indx]

  # perform the parameter update.
  # If every reward is identical, sigma_r is 0 and the division would produce
  # inf * 0 = NaN weights. All reward differences are zero in that case, so
  # skipping the step is the same as applying a zero update.
  if sigma_r > 0:
    w = w + (alpha)/(sigma_r*top_perf) * np.dot(N.T, (R_plus-R_minus))