In [32]:
import numpy as np
import cPickle as pickle
import gym
import matplotlib.pyplot as plt

H = 10
learning_rate = 2e-3
gamma = 0.99
decay_rate = 0.99
score_queue_size = 100
resume = False
D = 3

In [15]:
if resume:  model = pickle.load(open('save.p', 'rb'))
else:
    model = {}
    model['W1'] = np.random.randn(H,D) / np.sqrt(D)
    model['W2'] = np.random.randn(H) / np.sqrt(H)

grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() }
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() }


In [16]:
def sigmoid(x): 
    return 1.0 / (1.0 + np.exp(-x))

In [17]:
def prepro(I):
    return I[1:]

In [18]:
def discount_rewards(r):
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
        
    return discounted_r

In [19]:
def policy_forward(x):
    h = np.dot(model['W1'], x)
    h = sigmoid(h)
    logp = np.dot(model['W2'], h)
    p = sigmoid(logp)
    return p, h

In [20]:
def policy_backward(eph, epdlogp, epx):
    global grad_buffer
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    eph_dot = eph*(1-eph)
    dW1 = dh * eph_dot
    dW1 = np.dot(dW1.T, epx)
        
    for k in model: grad_buffer[k] += {'W1':dW1, 'W2':dW2}[k]

In [43]:
env = gym.make('CartPole-v0')
#env.monitor.start('CartPole', force=True)
observation = env.reset()
reward_sum, episode_num = 0,0
xs,hs,dlogps,drs = [],[],[],[]
score_queue = []
rList = []

[2017-07-25 14:00:22,549] Making new env: CartPole-v0


In [45]:
while True:
    
    x = prepro(observation)
    
    act_prob, h = policy_forward(x)
    
    if np.mean(score_queue) > 180:
        action = 1 if 0.5 < act_prob else 0
    else:
        action = 1 if np.random.uniform() < act_prob else 0

    xs.append(x)
    hs.append(h)
    y = action
    dlogps.append(y - act_prob)
    
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    
    drs.append(reward)
    
    if done:
        episode_num += 1
        
        if episode_num > score_queue_size:
            score_queue.append(reward_sum)
            score_queue.pop(0)
        else:
            score_queue.append(reward_sum)
        
        print "episode : " + str(episode_num) + ", reward : " + str(reward_sum) + ", reward_mean : " + str(np.mean(score_queue))
        rList.append(reward_sum)
        if np.mean(score_queue) >= 200:
            print "CartPole solved!!!!!"
            break
        
        epx = np.vstack(xs)
        eph = np.vstack(hs)
        epdlogp = np.vstack(dlogps)
        epr = np.vstack(drs)
        xs,hs,dlogps,drs = [],[],[],[]
        
        discounted_epr = discount_rewards(epr)
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr)
        
        epdlogp *= discounted_epr
        
        policy_backward(eph,epdlogp,epx)
        for k,v in model.iteritems():
            g = grad_buffer[k] 
            rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate)*g**2
            model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
            grad_buffer[k] = np.zeros_like(v)
        
        if episode_num % 1000 == 0: pickle.dump(model, open('Cart.p', 'wb'))
        
        reward_sum = 0
        observation = env.reset()
        

[2017-07-25 14:00:47,066] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 2, reward : 138.0, reward_mean : 138.0
episode : 3, reward : 200.0, reward_mean : 158.666666667
episode : 4, reward : 140.0, reward_mean : 154.0
episode : 5, reward : 200.0, reward_mean : 163.2
episode : 6, reward : 200.0, reward_mean : 169.333333333
episode : 7, reward : 17.0, reward_mean : 147.571428571
episode : 8, reward : 81.0, reward_mean : 139.25
episode : 9, reward : 200.0, reward_mean : 146.0
episode : 10, reward : 97.0, reward_mean : 141.1
episode : 11, reward : 187.0, reward_mean : 145.272727273
episode : 12, reward : 200.0, reward_mean : 149.833333333
episode : 13, reward : 189.0, reward_mean : 152.846153846
episode : 14, reward : 200.0, reward_mean : 156.214285714
episode : 15, reward : 200.0, reward_mean : 159.133333333
episode : 16, reward : 200.0, reward_mean : 161.6875
episode : 17, reward : 128.0, reward_mean : 159.705882353
episode : 18, reward : 139.0, reward_mean : 158.555555556
episode : 19, reward : 200.0, reward_mean : 160.736842105
episode : 20, rewar

episode : 158, reward : 200.0, reward_mean : 188.49
episode : 159, reward : 200.0, reward_mean : 188.56
episode : 160, reward : 200.0, reward_mean : 188.56
episode : 161, reward : 200.0, reward_mean : 188.56
episode : 162, reward : 200.0, reward_mean : 188.56
episode : 163, reward : 200.0, reward_mean : 188.56
episode : 164, reward : 200.0, reward_mean : 188.56
episode : 165, reward : 200.0, reward_mean : 189.19
episode : 166, reward : 200.0, reward_mean : 189.19
episode : 167, reward : 200.0, reward_mean : 190.8
episode : 168, reward : 200.0, reward_mean : 190.8
episode : 169, reward : 200.0, reward_mean : 190.8
episode : 170, reward : 200.0, reward_mean : 190.8
episode : 171, reward : 200.0, reward_mean : 190.8
episode : 172, reward : 200.0, reward_mean : 190.8
episode : 173, reward : 200.0, reward_mean : 191.43
episode : 174, reward : 200.0, reward_mean : 191.43
episode : 175, reward : 200.0, reward_mean : 191.43
episode : 176, reward : 200.0, reward_mean : 191.43
episode : 177, rew

In [46]:
plt.bar(range(len(rList)), rList, color = "blue", width = 0.01)

plt.show()

TypeError: float() argument must be a string or a number