In [5]:
import gym
import numpy as np
import random
np.random.seed(333)
env = gym.make('Marvin-v0')
observation = env.reset()

# the function we want to optimize
def f(w):
    # here we would normally:
    # ... 1) create a neural network with weights w
    # ... 2) run the neural network on the environment for some time
    # ... 3) sum up and return the total reward

    # but for the purposes of an example, lets try to minimize
    # the L2 distance to a specific solution vector. So the highest reward
    # we can achieve is 0, when the vector w is exactly equal to solution
    total = 0
    for i in range(50):
        observation, reward, done, info = env.step(w)
        total += reward
        if (done == True or reward == -100 or reward == 100):
            break
    observation = env.reset()
    return (total, i)

# hyperparameters
npop = 50 # population size
sigma = 0.1 # noise standard deviation
alpha = 0.05 # learning rate

# start the optimization
w = -2 *  np.random.random_sample((4)) + 1 # our initial guess is random

for i in range(100):
    # print current fitness of the most likely parameter setting
    if i % 10 == 0:
        print('iter %d. w: %s, reward: %f' % (i, str(w), f(w)[0]))

    # initialize memory for a population of w's, and their rewards
    N = -2 *  np.random.random_sample((npop, 4)) + 1 # samples from a normal distribution N(0,1)
    R = np.zeros(npop)
    steps = 0
    for j in range(npop):
        w_try = w + sigma*N[j] # jitter w using gaussian of sigma 0.1
        R[j], s = f(w_try) # evaluate the jittered version
        steps += s
    print('steps: %.1f' % (steps / npop))
    # standardize the rewards to have a gaussian distribution
    A = (R - np.mean(R)) / np.std(R)
    # perform the parameter update. The matrix multiply below
    # is just an efficient way to sum up all the rows of the noise matrix N,
    # where each row N[j] is weighted by A[j]
    w = w + alpha / (npop*sigma) * np.dot(N.T, A)
print('last w: %s, reward: %f' % (str(w), f(w)))

iter 0. w: [-0.08658217 -0.45790146  0.96623711  0.33932239], reward: -1.515477
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
iter 10. w: [-0.09231992 -0.50948863  0.80881123  0.4555401 ], reward: -1.800951
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
iter 20. w: [-0.04884302  0.06136911  0.71191172  0.2619075 ], reward: -8.494644
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
iter 30. w: [-0.19838916  0.36954634  0.66028055 -0.16729047], reward: 1.202544
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
iter 40. w: [-0.09160314  0.29904268  0.62227441 -0.19779349], reward: 0.512390
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.0
steps: 49.

TypeError: must be real number, not tuple

In [6]:
observation = env.reset()
for step in range(100):
    env.render()
    new_observation, reward, done, info = env.step(w)
    if (step % 10 == 0):
        print (step, reward)
    if (done == True):
        break 
env.close()

0 -0.08929724868523589
10 0.043469907391694464
20 -0.037381649341556154
30 0.1283756174630198
40 0.006871632907417686
50 -5.206001438093268e-05
60 -0.005166918363544075
70 -0.008101279861106394
80 -0.008977915174140452
90 -0.0093849354250943
