In [1]:
import gym
import numpy as np
rm='Pong-v4' # Gym environment id; passed to gym.make() when the environment is created

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Render a sequence of image frames as an inline JavaScript animation.

    Args:
        frames: sequence of image arrays accepted by ``plt.imshow``. The
            figure size is derived from the shape of the first frame
            (``shape[0]`` is used as height, ``shape[1]`` as width).

    Returns:
        None. The animation is emitted through ``IPython.display.display``.
        An empty ``frames`` sequence is a no-op.
    """
    if len(frames) == 0:
        # Nothing to show; indexing frames[0] below would raise IndexError.
        return
    height, width = frames[0].shape[0], frames[0].shape[1]
    # Keep an explicit handle on the figure instead of reaching into the
    # animation's private ``_fig`` attribute afterwards.
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi = 144)
    patch = plt.imshow(frames[0])
    plt.axis('off')
    def animate(i):
        # Swap the pixel data of the single image artist for frame i.
        patch.set_data(frames[i])
    anim = animation.FuncAnimation(fig, animate, frames = len(frames), interval=50)
    # Close the figure before displaying the HTML version -- presumably so the
    # inline backend does not also render a static copy of the first frame.
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

In [3]:
from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the gym module object; confirm
# that the installed gym version actually reads it. The rest of this notebook
# unpacks env.step() as a 4-tuple.
gym.new_step_api = True
env = gym.make(rm)

# ---- policy network ---------------------------------------------------------
H = 200      # number of hidden layer neurons
D = 80 * 80  # input dimensionality: 80x80 grid
# Weights are drawn in the order W1, W2 and scaled by 1/sqrt(fan-in)
# ("Xavier" initialization).
model = {
  'W1': np.random.randn(H, D) / np.sqrt(D),
  'W2': np.random.randn(H) / np.sqrt(H),
}

# ---- hyperparameters --------------------------------------------------------
batch_size = 10       # every how many episodes to do a param update?
learning_rate = 1e-3
gamma = 0.99          # discount factor for reward
decay_rate = 0.99     # decay factor for RMSProp leaky sum of grad^2

# ---- optimizer state, one zero-filled array per weight matrix ---------------
# gradients summed over the episodes of the current batch
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}
# RMSProp running average of squared gradients
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}

def sigmoid(x):
  """Logistic function 1 / (1 + e^-x); squashes x into the interval [0,1].

  Works element-wise on NumPy arrays as well as on scalars.
  """
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

  Rows 35..194 are kept, every second row and column is sampled, and only the
  first colour channel is used. Pixels equal to 144 or 109 (the two background
  values) become 0; every other pixel becomes 1.

  The input frame is never modified: all masking happens on a private copy.
  Returns a flat float array.
  """
  frame = np.asarray(I)
  # Basic slicing returns views, so masking them in place would write straight
  # into the caller's observation (and fail on a read-only array).
  # astype(float) gives us an independent copy to work on.
  pixels = frame[35:195][::2, ::2, 0].astype(float) # crop, then downsample by factor of 2
  pixels[pixels == 144] = 0 # erase background (background type 1)
  pixels[pixels == 109] = 0 # erase background (background type 2)
  pixels[pixels != 0] = 1 # everything else (paddles, ball) just set to 1
  return pixels.ravel()

def discount_rewards(r, discount=None):
  """ take 1D float array of rewards and compute discounted reward

  r: per-step rewards. A (T,1) column, as train_model passes, works too;
     lists and integer arrays are accepted as well.
  discount: per-step discount factor. When None, the module-level `gamma`
     is looked up at call time.

  Returns a floating-point array shaped like r. Walking backwards through time,
  the running sum is reset at every non-zero reward (pong specific: a non-zero
  reward marks a game boundary).
  """
  r = np.asarray(r)
  factor = gamma if discount is None else discount
  # Integer rewards would otherwise give an integer output buffer and silently
  # truncate the discounted values; floating inputs keep their own dtype.
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else float
  discounted_r = np.zeros_like(r, dtype=out_dtype)
  running_add = 0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * factor + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network held in the global `model`.

  x is multiplied by W1, passed through a ReLU, multiplied by W2 and squashed
  with sigmoid(). Returns (p, h): the probability of taking action 2 and the
  hidden activations (needed later for backprop).
  """
  hidden = np.dot(model['W1'], x)
  hidden = np.where(hidden < 0, 0, hidden) # ReLU nonlinearity
  score = np.dot(model['W2'], hidden)
  return sigmoid(score), hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns the gradients for W1 and W2.

  epx: stacked network inputs, one row per step.
  eph: stacked hidden activations, one row per step.
  epdlogp: per-step gradient signal on the output, one row per step.

  Reads W2 from the global `model`. Returns {'W1': ..., 'W2': ...}.
  """
  grad_w2 = np.dot(eph.T, epdlogp).ravel()
  # push the output signal back through W2 to the hidden layer ...
  grad_hidden = np.outer(epdlogp, model['W2'])
  # ... and block it wherever the ReLU was inactive (backprop through ReLU)
  relu_inactive = eph <= 0
  grad_hidden[relu_inactive] = 0
  grad_w1 = np.dot(grad_hidden.T, epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the greedy action for one frame using the given model.

  model: dict with weight arrays under 'W1' and 'W2'.
  observation: raw frame, preprocessed with prepro().
  prev_x: preprocessed previous frame, or None on the first step of an episode.

  Returns (action, cur_x): action is 2 when the network's probability for
  action 2 is at least 0.5 and 3 otherwise; cur_x is the preprocessed current
  frame, to be passed back in as prev_x on the next call.
  """
  # preprocess the observation, set input to network to be difference image
  cur_x = prepro(observation)
  # first frame of an episode has nothing to diff against: feed an all-zero input
  x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)

  # Run the forward pass on the `model` argument itself. Delegating to
  # policy_forward() would silently evaluate the module-level model and ignore
  # whatever the caller passed in.
  h = np.dot(model['W1'], x)
  h[h < 0] = 0 # ReLU nonlinearity
  aprob = sigmoid(np.dot(model['W2'], h))

  # deterministic (greedy) choice -- no sampling when playing back
  action = 2 if aprob >= 0.5 else 3

  return action, cur_x

def play_game(env, model, max_steps=1000):
  """Play one episode with the model's greedy policy and show it as an animation.

  env: environment providing reset(), render(mode='rgb_array'), step() returning
       (observation, reward, done, info), and close().
  model: weight dict forwarded to model_step().
  max_steps: upper bound on the number of environment steps (default 1000).

  Prints a one-line summary, closes `env`, then displays the recorded frames.
  Note that the environment is closed on return.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(max_steps):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # Only reached when the step budget ran out without `done`; previously this
    # message was also printed right after the success message above.
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  if frames: # max_steps <= 0 records nothing, so there is nothing to display
    display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy network with policy gradients and RMSProp.

  env: environment providing reset() and step() returning
       (observation, reward, done, info).
  model: weight dict updated in place. NOTE: the forward and backward passes
       (policy_forward / policy_backward) and the optimizer state
       (grad_buffer, rmsprop_cache) are module-level, so this must be the
       module-level `model` for the update to match the computed gradients.
  total_episodes: number of episodes to run; at least one episode is always played.

  Returns a list of (episode_number, reward_sum, running_reward) tuples, one
  per finished episode. One progress line is printed per episode.
  """
  hist = []
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      # If every discounted reward is identical the std is 0; dividing would
      # produce NaNs that flow into the gradient and permanently corrupt the
      # weights. The centered rewards are already all zero in that case.
      if reward_std > 0:
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          # gradient *ascent*: the accumulated gradient increases expected reward
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      # ">=" rather than "==" so that total_episodes <= 0 cannot loop forever
      if episode_number >= total_episodes:
        return hist

    # if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
    #   print('ep {}: game finished, reward: {}'.format(episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

  deprecation(
  deprecation(


In [4]:
# Show the environment's action space.
env.action_space

Discrete(6)

In [5]:
# List the human-readable name of each discrete action index.
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [6]:
# Train for 6000 episodes; %time reports how long the call takes and hist1 keeps train_model's return value.
%time hist1 = train_model(env, model, total_episodes=6000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -21.000000. running mean: -21.000000
episode 3.000000, reward total was -20.000000. running mean: -20.990000
episode 4.000000, reward total was -19.000000. running mean: -20.970100
episode 5.000000, reward total was -21.000000. running mean: -20.970399
episode 6.000000, reward total was -20.000000. running mean: -20.960695
episode 7.000000, reward total was -21.000000. running mean: -20.961088
episode 8.000000, reward total was -21.000000. running mean: -20.961477
episode 9.000000, reward total was -21.000000. running mean: -20.961862
episode 10.000000, reward total was -21.000000. running mean: -20.962244
episode 11.000000, reward total was -19.000000. running mean: -20.942621
episode 12.000000, reward total was -20.000000. running mean: -20.933195
episode 13.000000, reward total was -21.000000. running mean: -20.933863
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.425627
episode 115.000000, reward total was -21.000000. running mean: -20.431370
episode 116.000000, reward total was -19.000000. running mean: -20.417057
episode 117.000000, reward total was -20.000000. running mean: -20.412886
episode 118.000000, reward total was -20.000000. running mean: -20.408757
episode 119.000000, reward total was -21.000000. running mean: -20.414670
episode 120.000000, reward total was -21.000000. running mean: -20.420523
episode 121.000000, reward total was -19.000000. running mean: -20.406318
episode 122.000000, reward total was -19.000000. running mean: -20.392255
episode 123.000000, reward total was -19.000000. running mean: -20.378332
episode 124.000000, reward total was -21.000000. running mean: -20.384549
episode 125.000000, reward total was -21.000000. running mean: -20.390703
episode 126.000000, reward total was -18.000000. running mean: -20.366796
episode 127.000000, reward total was -

episode 225.000000, reward total was -19.000000. running mean: -20.124722
episode 226.000000, reward total was -21.000000. running mean: -20.133475
episode 227.000000, reward total was -17.000000. running mean: -20.102140
episode 228.000000, reward total was -18.000000. running mean: -20.081119
episode 229.000000, reward total was -20.000000. running mean: -20.080308
episode 230.000000, reward total was -19.000000. running mean: -20.069505
episode 231.000000, reward total was -20.000000. running mean: -20.068810
episode 232.000000, reward total was -20.000000. running mean: -20.068122
episode 233.000000, reward total was -20.000000. running mean: -20.067440
episode 234.000000, reward total was -20.000000. running mean: -20.066766
episode 235.000000, reward total was -19.000000. running mean: -20.056098
episode 236.000000, reward total was -21.000000. running mean: -20.065537
episode 237.000000, reward total was -18.000000. running mean: -20.044882
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -19.561508
episode 337.000000, reward total was -15.000000. running mean: -19.515893
episode 338.000000, reward total was -20.000000. running mean: -19.520734
episode 339.000000, reward total was -17.000000. running mean: -19.495527
episode 340.000000, reward total was -17.000000. running mean: -19.470572
episode 341.000000, reward total was -18.000000. running mean: -19.455866
episode 342.000000, reward total was -18.000000. running mean: -19.441307
episode 343.000000, reward total was -21.000000. running mean: -19.456894
episode 344.000000, reward total was -21.000000. running mean: -19.472325
episode 345.000000, reward total was -19.000000. running mean: -19.467602
episode 346.000000, reward total was -18.000000. running mean: -19.452926
episode 347.000000, reward total was -19.000000. running mean: -19.448397
episode 348.000000, reward total was -21.000000. running mean: -19.463913
episode 349.000000, reward total was -

episode 447.000000, reward total was -21.000000. running mean: -18.982109
episode 448.000000, reward total was -19.000000. running mean: -18.982288
episode 449.000000, reward total was -20.000000. running mean: -18.992465
episode 450.000000, reward total was -21.000000. running mean: -19.012541
episode 451.000000, reward total was -19.000000. running mean: -19.012415
episode 452.000000, reward total was -18.000000. running mean: -19.002291
episode 453.000000, reward total was -17.000000. running mean: -18.982268
episode 454.000000, reward total was -16.000000. running mean: -18.952445
episode 455.000000, reward total was -20.000000. running mean: -18.962921
episode 456.000000, reward total was -20.000000. running mean: -18.973292
episode 457.000000, reward total was -19.000000. running mean: -18.973559
episode 458.000000, reward total was -20.000000. running mean: -18.983823
episode 459.000000, reward total was -18.000000. running mean: -18.973985
episode 460.000000, reward total was -

episode 558.000000, reward total was -16.000000. running mean: -18.064710
episode 559.000000, reward total was -19.000000. running mean: -18.074063
episode 560.000000, reward total was -17.000000. running mean: -18.063322
episode 561.000000, reward total was -16.000000. running mean: -18.042689
episode 562.000000, reward total was -17.000000. running mean: -18.032262
episode 563.000000, reward total was -17.000000. running mean: -18.021940
episode 564.000000, reward total was -17.000000. running mean: -18.011720
episode 565.000000, reward total was -17.000000. running mean: -18.001603
episode 566.000000, reward total was -16.000000. running mean: -17.981587
episode 567.000000, reward total was -16.000000. running mean: -17.961771
episode 568.000000, reward total was -20.000000. running mean: -17.982154
episode 569.000000, reward total was -18.000000. running mean: -17.982332
episode 570.000000, reward total was -15.000000. running mean: -17.952509
episode 571.000000, reward total was -

episode 669.000000, reward total was -17.000000. running mean: -17.240358
episode 670.000000, reward total was -19.000000. running mean: -17.257954
episode 671.000000, reward total was -11.000000. running mean: -17.195375
episode 672.000000, reward total was -19.000000. running mean: -17.213421
episode 673.000000, reward total was -10.000000. running mean: -17.141287
episode 674.000000, reward total was -16.000000. running mean: -17.129874
episode 675.000000, reward total was -16.000000. running mean: -17.118575
episode 676.000000, reward total was -12.000000. running mean: -17.067389
episode 677.000000, reward total was -19.000000. running mean: -17.086716
episode 678.000000, reward total was -19.000000. running mean: -17.105848
episode 679.000000, reward total was -16.000000. running mean: -17.094790
episode 680.000000, reward total was -21.000000. running mean: -17.133842
episode 681.000000, reward total was -15.000000. running mean: -17.112504
episode 682.000000, reward total was -

episode 780.000000, reward total was -11.000000. running mean: -16.440477
episode 781.000000, reward total was -19.000000. running mean: -16.466072
episode 782.000000, reward total was -11.000000. running mean: -16.411411
episode 783.000000, reward total was -16.000000. running mean: -16.407297
episode 784.000000, reward total was -17.000000. running mean: -16.413224
episode 785.000000, reward total was -19.000000. running mean: -16.439092
episode 786.000000, reward total was -15.000000. running mean: -16.424701
episode 787.000000, reward total was -11.000000. running mean: -16.370454
episode 788.000000, reward total was -15.000000. running mean: -16.356750
episode 789.000000, reward total was -17.000000. running mean: -16.363182
episode 790.000000, reward total was -12.000000. running mean: -16.319550
episode 791.000000, reward total was -17.000000. running mean: -16.326355
episode 792.000000, reward total was -10.000000. running mean: -16.263091
episode 793.000000, reward total was -

episode 891.000000, reward total was -13.000000. running mean: -15.480791
episode 892.000000, reward total was -15.000000. running mean: -15.475983
episode 893.000000, reward total was -12.000000. running mean: -15.441223
episode 894.000000, reward total was -12.000000. running mean: -15.406811
episode 895.000000, reward total was -7.000000. running mean: -15.322743
episode 896.000000, reward total was -18.000000. running mean: -15.349515
episode 897.000000, reward total was -10.000000. running mean: -15.296020
episode 898.000000, reward total was -14.000000. running mean: -15.283060
episode 899.000000, reward total was -19.000000. running mean: -15.320229
episode 900.000000, reward total was -14.000000. running mean: -15.307027
episode 901.000000, reward total was -13.000000. running mean: -15.283957
episode 902.000000, reward total was -19.000000. running mean: -15.321117
episode 903.000000, reward total was -17.000000. running mean: -15.337906
episode 904.000000, reward total was -1

episode 1002.000000, reward total was -12.000000. running mean: -14.373283
episode 1003.000000, reward total was -9.000000. running mean: -14.319550
episode 1004.000000, reward total was -7.000000. running mean: -14.246355
episode 1005.000000, reward total was -11.000000. running mean: -14.213891
episode 1006.000000, reward total was -9.000000. running mean: -14.161752
episode 1007.000000, reward total was -19.000000. running mean: -14.210135
episode 1008.000000, reward total was -11.000000. running mean: -14.178033
episode 1009.000000, reward total was -7.000000. running mean: -14.106253
episode 1010.000000, reward total was -13.000000. running mean: -14.095191
episode 1011.000000, reward total was -17.000000. running mean: -14.124239
episode 1012.000000, reward total was -10.000000. running mean: -14.082996
episode 1013.000000, reward total was -16.000000. running mean: -14.102166
episode 1014.000000, reward total was -7.000000. running mean: -14.031145
episode 1015.000000, reward to

episode 1112.000000, reward total was -13.000000. running mean: -13.572400
episode 1113.000000, reward total was -11.000000. running mean: -13.546676
episode 1114.000000, reward total was -10.000000. running mean: -13.511209
episode 1115.000000, reward total was -12.000000. running mean: -13.496097
episode 1116.000000, reward total was -9.000000. running mean: -13.451136
episode 1117.000000, reward total was -18.000000. running mean: -13.496625
episode 1118.000000, reward total was -16.000000. running mean: -13.521658
episode 1119.000000, reward total was -8.000000. running mean: -13.466442
episode 1120.000000, reward total was -16.000000. running mean: -13.491777
episode 1121.000000, reward total was -2.000000. running mean: -13.376860
episode 1122.000000, reward total was -15.000000. running mean: -13.393091
episode 1123.000000, reward total was -8.000000. running mean: -13.339160
episode 1124.000000, reward total was -8.000000. running mean: -13.285768
episode 1125.000000, reward to

episode 1222.000000, reward total was -12.000000. running mean: -12.555646
episode 1223.000000, reward total was -13.000000. running mean: -12.560089
episode 1224.000000, reward total was -16.000000. running mean: -12.594489
episode 1225.000000, reward total was -11.000000. running mean: -12.578544
episode 1226.000000, reward total was -14.000000. running mean: -12.592758
episode 1227.000000, reward total was -8.000000. running mean: -12.546831
episode 1228.000000, reward total was -3.000000. running mean: -12.451362
episode 1229.000000, reward total was -13.000000. running mean: -12.456849
episode 1230.000000, reward total was -13.000000. running mean: -12.462280
episode 1231.000000, reward total was -10.000000. running mean: -12.437657
episode 1232.000000, reward total was -12.000000. running mean: -12.433281
episode 1233.000000, reward total was -4.000000. running mean: -12.348948
episode 1234.000000, reward total was -9.000000. running mean: -12.315459
episode 1235.000000, reward t

episode 1332.000000, reward total was -12.000000. running mean: -11.836748
episode 1333.000000, reward total was -11.000000. running mean: -11.828380
episode 1334.000000, reward total was -13.000000. running mean: -11.840096
episode 1335.000000, reward total was -2.000000. running mean: -11.741695
episode 1336.000000, reward total was -10.000000. running mean: -11.724278
episode 1337.000000, reward total was -9.000000. running mean: -11.697036
episode 1338.000000, reward total was -10.000000. running mean: -11.680065
episode 1339.000000, reward total was -14.000000. running mean: -11.703265
episode 1340.000000, reward total was -15.000000. running mean: -11.736232
episode 1341.000000, reward total was -11.000000. running mean: -11.728870
episode 1342.000000, reward total was -12.000000. running mean: -11.731581
episode 1343.000000, reward total was -5.000000. running mean: -11.664265
episode 1344.000000, reward total was -6.000000. running mean: -11.607622
episode 1345.000000, reward t

episode 1442.000000, reward total was -9.000000. running mean: -10.850488
episode 1443.000000, reward total was -11.000000. running mean: -10.851983
episode 1444.000000, reward total was -13.000000. running mean: -10.873463
episode 1445.000000, reward total was -15.000000. running mean: -10.914728
episode 1446.000000, reward total was -11.000000. running mean: -10.915581
episode 1447.000000, reward total was -10.000000. running mean: -10.906425
episode 1448.000000, reward total was -9.000000. running mean: -10.887361
episode 1449.000000, reward total was -12.000000. running mean: -10.898487
episode 1450.000000, reward total was -7.000000. running mean: -10.859503
episode 1451.000000, reward total was -8.000000. running mean: -10.830908
episode 1452.000000, reward total was -11.000000. running mean: -10.832598
episode 1453.000000, reward total was -6.000000. running mean: -10.784272
episode 1454.000000, reward total was -7.000000. running mean: -10.746430
episode 1455.000000, reward tot

episode 1553.000000, reward total was -4.000000. running mean: -9.319462
episode 1554.000000, reward total was -12.000000. running mean: -9.346267
episode 1555.000000, reward total was -5.000000. running mean: -9.302805
episode 1556.000000, reward total was -10.000000. running mean: -9.309777
episode 1557.000000, reward total was -11.000000. running mean: -9.326679
episode 1558.000000, reward total was -8.000000. running mean: -9.313412
episode 1559.000000, reward total was -7.000000. running mean: -9.290278
episode 1560.000000, reward total was -12.000000. running mean: -9.317375
episode 1561.000000, reward total was -7.000000. running mean: -9.294201
episode 1562.000000, reward total was 5.000000. running mean: -9.151259
episode 1563.000000, reward total was -8.000000. running mean: -9.139747
episode 1564.000000, reward total was -8.000000. running mean: -9.128349
episode 1565.000000, reward total was -13.000000. running mean: -9.167066
episode 1566.000000, reward total was -3.000000

episode 1665.000000, reward total was -16.000000. running mean: -8.374219
episode 1666.000000, reward total was -15.000000. running mean: -8.440477
episode 1667.000000, reward total was -13.000000. running mean: -8.486072
episode 1668.000000, reward total was -8.000000. running mean: -8.481211
episode 1669.000000, reward total was -6.000000. running mean: -8.456399
episode 1670.000000, reward total was -1.000000. running mean: -8.381835
episode 1671.000000, reward total was -11.000000. running mean: -8.408017
episode 1672.000000, reward total was -17.000000. running mean: -8.493937
episode 1673.000000, reward total was -9.000000. running mean: -8.498997
episode 1674.000000, reward total was -3.000000. running mean: -8.444007
episode 1675.000000, reward total was -9.000000. running mean: -8.449567
episode 1676.000000, reward total was -11.000000. running mean: -8.475072
episode 1677.000000, reward total was -9.000000. running mean: -8.480321
episode 1678.000000, reward total was -7.0000

episode 1777.000000, reward total was 6.000000. running mean: -7.813822
episode 1778.000000, reward total was -6.000000. running mean: -7.795684
episode 1779.000000, reward total was -9.000000. running mean: -7.807727
episode 1780.000000, reward total was -10.000000. running mean: -7.829650
episode 1781.000000, reward total was -7.000000. running mean: -7.821353
episode 1782.000000, reward total was -11.000000. running mean: -7.853140
episode 1783.000000, reward total was -6.000000. running mean: -7.834608
episode 1784.000000, reward total was -8.000000. running mean: -7.836262
episode 1785.000000, reward total was -15.000000. running mean: -7.907900
episode 1786.000000, reward total was -5.000000. running mean: -7.878821
episode 1787.000000, reward total was -6.000000. running mean: -7.860032
episode 1788.000000, reward total was -7.000000. running mean: -7.851432
episode 1789.000000, reward total was -14.000000. running mean: -7.912918
episode 1790.000000, reward total was -3.000000.

episode 1889.000000, reward total was -1.000000. running mean: -6.622560
episode 1890.000000, reward total was -11.000000. running mean: -6.666334
episode 1891.000000, reward total was -3.000000. running mean: -6.629671
episode 1892.000000, reward total was -8.000000. running mean: -6.643374
episode 1893.000000, reward total was 2.000000. running mean: -6.556940
episode 1894.000000, reward total was -9.000000. running mean: -6.581371
episode 1895.000000, reward total was 8.000000. running mean: -6.435557
episode 1896.000000, reward total was -6.000000. running mean: -6.431202
episode 1897.000000, reward total was -9.000000. running mean: -6.456890
episode 1898.000000, reward total was 1.000000. running mean: -6.382321
episode 1899.000000, reward total was -3.000000. running mean: -6.348497
episode 1900.000000, reward total was -13.000000. running mean: -6.415012
episode 1901.000000, reward total was -10.000000. running mean: -6.450862
episode 1902.000000, reward total was -4.000000. ru

episode 2002.000000, reward total was -6.000000. running mean: -6.278076
episode 2003.000000, reward total was -5.000000. running mean: -6.265295
episode 2004.000000, reward total was -6.000000. running mean: -6.262642
episode 2005.000000, reward total was -7.000000. running mean: -6.270015
episode 2006.000000, reward total was -7.000000. running mean: -6.277315
episode 2007.000000, reward total was -1.000000. running mean: -6.224542
episode 2008.000000, reward total was -7.000000. running mean: -6.232297
episode 2009.000000, reward total was -3.000000. running mean: -6.199974
episode 2010.000000, reward total was -15.000000. running mean: -6.287974
episode 2011.000000, reward total was -9.000000. running mean: -6.315094
episode 2012.000000, reward total was -8.000000. running mean: -6.331943
episode 2013.000000, reward total was -6.000000. running mean: -6.328624
episode 2014.000000, reward total was 5.000000. running mean: -6.215338
episode 2015.000000, reward total was -6.000000. ru

episode 2115.000000, reward total was 6.000000. running mean: -5.356570
episode 2116.000000, reward total was -6.000000. running mean: -5.363004
episode 2117.000000, reward total was -12.000000. running mean: -5.429374
episode 2118.000000, reward total was 1.000000. running mean: -5.365080
episode 2119.000000, reward total was -7.000000. running mean: -5.381429
episode 2120.000000, reward total was -14.000000. running mean: -5.467615
episode 2121.000000, reward total was 5.000000. running mean: -5.362939
episode 2122.000000, reward total was -5.000000. running mean: -5.359310
episode 2123.000000, reward total was -15.000000. running mean: -5.455716
episode 2124.000000, reward total was -6.000000. running mean: -5.461159
episode 2125.000000, reward total was -2.000000. running mean: -5.426548
episode 2126.000000, reward total was -6.000000. running mean: -5.432282
episode 2127.000000, reward total was 4.000000. running mean: -5.337959
episode 2128.000000, reward total was -17.000000. ru

episode 2228.000000, reward total was 3.000000. running mean: -5.199513
episode 2229.000000, reward total was -8.000000. running mean: -5.227518
episode 2230.000000, reward total was -6.000000. running mean: -5.235243
episode 2231.000000, reward total was 2.000000. running mean: -5.162890
episode 2232.000000, reward total was -3.000000. running mean: -5.141262
episode 2233.000000, reward total was -8.000000. running mean: -5.169849
episode 2234.000000, reward total was -10.000000. running mean: -5.218150
episode 2235.000000, reward total was -10.000000. running mean: -5.265969
episode 2236.000000, reward total was -9.000000. running mean: -5.303309
episode 2237.000000, reward total was -9.000000. running mean: -5.340276
episode 2238.000000, reward total was -17.000000. running mean: -5.456873
episode 2239.000000, reward total was -13.000000. running mean: -5.532305
episode 2240.000000, reward total was 10.000000. running mean: -5.376982
episode 2241.000000, reward total was -3.000000. 

episode 2341.000000, reward total was -11.000000. running mean: -4.821690
episode 2342.000000, reward total was -2.000000. running mean: -4.793473
episode 2343.000000, reward total was -11.000000. running mean: -4.855538
episode 2344.000000, reward total was 2.000000. running mean: -4.786983
episode 2345.000000, reward total was -13.000000. running mean: -4.869113
episode 2346.000000, reward total was -7.000000. running mean: -4.890422
episode 2347.000000, reward total was -6.000000. running mean: -4.901518
episode 2348.000000, reward total was 1.000000. running mean: -4.842503
episode 2349.000000, reward total was 1.000000. running mean: -4.784078
episode 2350.000000, reward total was -2.000000. running mean: -4.756237
episode 2351.000000, reward total was -7.000000. running mean: -4.778674
episode 2352.000000, reward total was 7.000000. running mean: -4.660888
episode 2353.000000, reward total was -3.000000. running mean: -4.644279
episode 2354.000000, reward total was -2.000000. run

episode 2454.000000, reward total was 4.000000. running mean: -4.564271
episode 2455.000000, reward total was -9.000000. running mean: -4.608629
episode 2456.000000, reward total was -9.000000. running mean: -4.652542
episode 2457.000000, reward total was -13.000000. running mean: -4.736017
episode 2458.000000, reward total was -4.000000. running mean: -4.728657
episode 2459.000000, reward total was -5.000000. running mean: -4.731370
episode 2460.000000, reward total was -7.000000. running mean: -4.754057
episode 2461.000000, reward total was -5.000000. running mean: -4.756516
episode 2462.000000, reward total was -9.000000. running mean: -4.798951
episode 2463.000000, reward total was -9.000000. running mean: -4.840961
episode 2464.000000, reward total was -3.000000. running mean: -4.822552
episode 2465.000000, reward total was -3.000000. running mean: -4.804326
episode 2466.000000, reward total was -7.000000. running mean: -4.826283
episode 2467.000000, reward total was -9.000000. ru

episode 2567.000000, reward total was -9.000000. running mean: -5.010346
episode 2568.000000, reward total was -5.000000. running mean: -5.010243
episode 2569.000000, reward total was -5.000000. running mean: -5.010140
episode 2570.000000, reward total was -11.000000. running mean: -5.070039
episode 2571.000000, reward total was -8.000000. running mean: -5.099338
episode 2572.000000, reward total was -8.000000. running mean: -5.128345
episode 2573.000000, reward total was -3.000000. running mean: -5.107062
episode 2574.000000, reward total was -6.000000. running mean: -5.115991
episode 2575.000000, reward total was -1.000000. running mean: -5.074831
episode 2576.000000, reward total was 4.000000. running mean: -4.984083
episode 2577.000000, reward total was -1.000000. running mean: -4.944242
episode 2578.000000, reward total was -5.000000. running mean: -4.944799
episode 2579.000000, reward total was 5.000000. running mean: -4.845351
episode 2580.000000, reward total was -4.000000. run

episode 2680.000000, reward total was 5.000000. running mean: -4.681978
episode 2681.000000, reward total was -9.000000. running mean: -4.725159
episode 2682.000000, reward total was -12.000000. running mean: -4.797907
episode 2683.000000, reward total was -1.000000. running mean: -4.759928
episode 2684.000000, reward total was 3.000000. running mean: -4.682329
episode 2685.000000, reward total was 6.000000. running mean: -4.575505
episode 2686.000000, reward total was -7.000000. running mean: -4.599750
episode 2687.000000, reward total was -6.000000. running mean: -4.613753
episode 2688.000000, reward total was 3.000000. running mean: -4.537615
episode 2689.000000, reward total was 4.000000. running mean: -4.452239
episode 2690.000000, reward total was -11.000000. running mean: -4.517717
episode 2691.000000, reward total was -2.000000. running mean: -4.492540
episode 2692.000000, reward total was 2.000000. running mean: -4.427614
episode 2693.000000, reward total was -5.000000. runnin

episode 2793.000000, reward total was -1.000000. running mean: -3.650683
episode 2794.000000, reward total was 9.000000. running mean: -3.524176
episode 2795.000000, reward total was -9.000000. running mean: -3.578934
episode 2796.000000, reward total was 3.000000. running mean: -3.513145
episode 2797.000000, reward total was -2.000000. running mean: -3.498013
episode 2798.000000, reward total was -9.000000. running mean: -3.553033
episode 2799.000000, reward total was -6.000000. running mean: -3.577503
episode 2800.000000, reward total was -3.000000. running mean: -3.571728
episode 2801.000000, reward total was -7.000000. running mean: -3.606011
episode 2802.000000, reward total was 5.000000. running mean: -3.519951
episode 2803.000000, reward total was 6.000000. running mean: -3.424751
episode 2804.000000, reward total was 1.000000. running mean: -3.380503
episode 2805.000000, reward total was -6.000000. running mean: -3.406698
episode 2806.000000, reward total was 1.000000. running 

episode 2906.000000, reward total was -2.000000. running mean: -2.987439
episode 2907.000000, reward total was -3.000000. running mean: -2.987564
episode 2908.000000, reward total was -7.000000. running mean: -3.027689
episode 2909.000000, reward total was 2.000000. running mean: -2.977412
episode 2910.000000, reward total was -1.000000. running mean: -2.957638
episode 2911.000000, reward total was -8.000000. running mean: -3.008061
episode 2912.000000, reward total was -3.000000. running mean: -3.007981
episode 2913.000000, reward total was 3.000000. running mean: -2.947901
episode 2914.000000, reward total was -3.000000. running mean: -2.948422
episode 2915.000000, reward total was -8.000000. running mean: -2.998938
episode 2916.000000, reward total was -7.000000. running mean: -3.038948
episode 2917.000000, reward total was -3.000000. running mean: -3.038559
episode 2918.000000, reward total was 10.000000. running mean: -2.908173
episode 2919.000000, reward total was 8.000000. runni

episode 3019.000000, reward total was -6.000000. running mean: -3.211519
episode 3020.000000, reward total was -7.000000. running mean: -3.249404
episode 3021.000000, reward total was 1.000000. running mean: -3.206910
episode 3022.000000, reward total was -9.000000. running mean: -3.264841
episode 3023.000000, reward total was -5.000000. running mean: -3.282193
episode 3024.000000, reward total was -8.000000. running mean: -3.329371
episode 3025.000000, reward total was 6.000000. running mean: -3.236077
episode 3026.000000, reward total was -7.000000. running mean: -3.273716
episode 3027.000000, reward total was -4.000000. running mean: -3.280979
episode 3028.000000, reward total was -5.000000. running mean: -3.298169
episode 3029.000000, reward total was -11.000000. running mean: -3.375188
episode 3030.000000, reward total was -12.000000. running mean: -3.461436
episode 3031.000000, reward total was 7.000000. running mean: -3.356821
episode 3032.000000, reward total was -9.000000. run

episode 3132.000000, reward total was -1.000000. running mean: -3.388707
episode 3133.000000, reward total was -11.000000. running mean: -3.464820
episode 3134.000000, reward total was 3.000000. running mean: -3.400171
episode 3135.000000, reward total was -1.000000. running mean: -3.376170
episode 3136.000000, reward total was 4.000000. running mean: -3.302408
episode 3137.000000, reward total was -10.000000. running mean: -3.369384
episode 3138.000000, reward total was -11.000000. running mean: -3.445690
episode 3139.000000, reward total was -9.000000. running mean: -3.501233
episode 3140.000000, reward total was -1.000000. running mean: -3.476221
episode 3141.000000, reward total was 3.000000. running mean: -3.411459
episode 3142.000000, reward total was -11.000000. running mean: -3.487344
episode 3143.000000, reward total was -3.000000. running mean: -3.482471
episode 3144.000000, reward total was -7.000000. running mean: -3.517646
episode 3145.000000, reward total was 7.000000. ru

episode 3245.000000, reward total was -15.000000. running mean: -3.050170
episode 3246.000000, reward total was 4.000000. running mean: -2.979668
episode 3247.000000, reward total was -6.000000. running mean: -3.009871
episode 3248.000000, reward total was 9.000000. running mean: -2.889773
episode 3249.000000, reward total was -9.000000. running mean: -2.950875
episode 3250.000000, reward total was -1.000000. running mean: -2.931366
episode 3251.000000, reward total was 1.000000. running mean: -2.892052
episode 3252.000000, reward total was -3.000000. running mean: -2.893132
episode 3253.000000, reward total was 2.000000. running mean: -2.844201
episode 3254.000000, reward total was -5.000000. running mean: -2.865759
episode 3255.000000, reward total was 8.000000. running mean: -2.757101
episode 3256.000000, reward total was -1.000000. running mean: -2.739530
episode 3257.000000, reward total was -1.000000. running mean: -2.722135
episode 3258.000000, reward total was -9.000000. runnin

episode 3358.000000, reward total was -8.000000. running mean: -1.961143
episode 3359.000000, reward total was 6.000000. running mean: -1.881531
episode 3360.000000, reward total was -5.000000. running mean: -1.912716
episode 3361.000000, reward total was 7.000000. running mean: -1.823589
episode 3362.000000, reward total was -13.000000. running mean: -1.935353
episode 3363.000000, reward total was 2.000000. running mean: -1.895999
episode 3364.000000, reward total was 1.000000. running mean: -1.867039
episode 3365.000000, reward total was -1.000000. running mean: -1.858369
episode 3366.000000, reward total was -7.000000. running mean: -1.909785
episode 3367.000000, reward total was 1.000000. running mean: -1.880687
episode 3368.000000, reward total was -5.000000. running mean: -1.911880
episode 3369.000000, reward total was -9.000000. running mean: -1.982762
episode 3370.000000, reward total was 1.000000. running mean: -1.952934
episode 3371.000000, reward total was -6.000000. running

episode 3471.000000, reward total was 5.000000. running mean: -2.304877
episode 3472.000000, reward total was -13.000000. running mean: -2.411828
episode 3473.000000, reward total was -9.000000. running mean: -2.477710
episode 3474.000000, reward total was 3.000000. running mean: -2.422933
episode 3475.000000, reward total was -3.000000. running mean: -2.428703
episode 3476.000000, reward total was 10.000000. running mean: -2.304416
episode 3477.000000, reward total was -1.000000. running mean: -2.291372
episode 3478.000000, reward total was 10.000000. running mean: -2.168458
episode 3479.000000, reward total was -7.000000. running mean: -2.216774
episode 3480.000000, reward total was 2.000000. running mean: -2.174606
episode 3481.000000, reward total was 7.000000. running mean: -2.082860
episode 3482.000000, reward total was -1.000000. running mean: -2.072031
episode 3483.000000, reward total was -1.000000. running mean: -2.061311
episode 3484.000000, reward total was 1.000000. runnin

episode 3584.000000, reward total was 5.000000. running mean: -1.065916
episode 3585.000000, reward total was 1.000000. running mean: -1.045256
episode 3586.000000, reward total was -5.000000. running mean: -1.084804
episode 3587.000000, reward total was 9.000000. running mean: -0.983956
episode 3588.000000, reward total was -3.000000. running mean: -1.004116
episode 3589.000000, reward total was 1.000000. running mean: -0.984075
episode 3590.000000, reward total was 4.000000. running mean: -0.934234
episode 3591.000000, reward total was -4.000000. running mean: -0.964892
episode 3592.000000, reward total was -4.000000. running mean: -0.995243
episode 3593.000000, reward total was 1.000000. running mean: -0.975291
episode 3594.000000, reward total was 4.000000. running mean: -0.925538
episode 3595.000000, reward total was -4.000000. running mean: -0.956282
episode 3596.000000, reward total was 7.000000. running mean: -0.876719
episode 3597.000000, reward total was -2.000000. running me

episode 3697.000000, reward total was -2.000000. running mean: -1.800682
episode 3698.000000, reward total was -2.000000. running mean: -1.802675
episode 3699.000000, reward total was -4.000000. running mean: -1.824648
episode 3700.000000, reward total was -4.000000. running mean: -1.846402
episode 3701.000000, reward total was -3.000000. running mean: -1.857938
episode 3702.000000, reward total was -4.000000. running mean: -1.879358
episode 3703.000000, reward total was 3.000000. running mean: -1.830565
episode 3704.000000, reward total was -7.000000. running mean: -1.882259
episode 3705.000000, reward total was -3.000000. running mean: -1.893436
episode 3706.000000, reward total was -13.000000. running mean: -2.004502
episode 3707.000000, reward total was 3.000000. running mean: -1.954457
episode 3708.000000, reward total was -6.000000. running mean: -1.994912
episode 3709.000000, reward total was -1.000000. running mean: -1.984963
episode 3710.000000, reward total was -7.000000. run

episode 3810.000000, reward total was 5.000000. running mean: -0.448483
episode 3811.000000, reward total was -4.000000. running mean: -0.483998
episode 3812.000000, reward total was -9.000000. running mean: -0.569158
episode 3813.000000, reward total was 6.000000. running mean: -0.503466
episode 3814.000000, reward total was -1.000000. running mean: -0.508432
episode 3815.000000, reward total was 6.000000. running mean: -0.443347
episode 3816.000000, reward total was 6.000000. running mean: -0.378914
episode 3817.000000, reward total was -3.000000. running mean: -0.405125
episode 3818.000000, reward total was -7.000000. running mean: -0.471074
episode 3819.000000, reward total was 10.000000. running mean: -0.366363
episode 3820.000000, reward total was -1.000000. running mean: -0.372699
episode 3821.000000, reward total was 5.000000. running mean: -0.318972
episode 3822.000000, reward total was -4.000000. running mean: -0.355783
episode 3823.000000, reward total was -7.000000. running

episode 3923.000000, reward total was 2.000000. running mean: -1.067212
episode 3924.000000, reward total was 6.000000. running mean: -0.996539
episode 3925.000000, reward total was 6.000000. running mean: -0.926574
episode 3926.000000, reward total was -1.000000. running mean: -0.927308
episode 3927.000000, reward total was 6.000000. running mean: -0.858035
episode 3928.000000, reward total was -7.000000. running mean: -0.919455
episode 3929.000000, reward total was -5.000000. running mean: -0.960260
episode 3930.000000, reward total was 4.000000. running mean: -0.910658
episode 3931.000000, reward total was -7.000000. running mean: -0.971551
episode 3932.000000, reward total was -9.000000. running mean: -1.051836
episode 3933.000000, reward total was -2.000000. running mean: -1.061317
episode 3934.000000, reward total was -3.000000. running mean: -1.080704
episode 3935.000000, reward total was -5.000000. running mean: -1.119897
episode 3936.000000, reward total was -5.000000. running

episode 4036.000000, reward total was 1.000000. running mean: -0.422260
episode 4037.000000, reward total was 5.000000. running mean: -0.368037
episode 4038.000000, reward total was 9.000000. running mean: -0.274357
episode 4039.000000, reward total was 4.000000. running mean: -0.231613
episode 4040.000000, reward total was -1.000000. running mean: -0.239297
episode 4041.000000, reward total was -2.000000. running mean: -0.256904
episode 4042.000000, reward total was -1.000000. running mean: -0.264335
episode 4043.000000, reward total was -12.000000. running mean: -0.381692
episode 4044.000000, reward total was 9.000000. running mean: -0.287875
episode 4045.000000, reward total was 3.000000. running mean: -0.254996
episode 4046.000000, reward total was 5.000000. running mean: -0.202446
episode 4047.000000, reward total was -9.000000. running mean: -0.290422
episode 4048.000000, reward total was 4.000000. running mean: -0.247517
episode 4049.000000, reward total was -5.000000. running m

episode 4150.000000, reward total was -4.000000. running mean: 0.230469
episode 4151.000000, reward total was 11.000000. running mean: 0.338164
episode 4152.000000, reward total was 6.000000. running mean: 0.394783
episode 4153.000000, reward total was 4.000000. running mean: 0.430835
episode 4154.000000, reward total was 4.000000. running mean: 0.466527
episode 4155.000000, reward total was -7.000000. running mean: 0.391861
episode 4156.000000, reward total was -3.000000. running mean: 0.357943
episode 4157.000000, reward total was 4.000000. running mean: 0.394363
episode 4158.000000, reward total was 2.000000. running mean: 0.410420
episode 4159.000000, reward total was -5.000000. running mean: 0.356315
episode 4160.000000, reward total was -2.000000. running mean: 0.332752
episode 4161.000000, reward total was -3.000000. running mean: 0.299425
episode 4162.000000, reward total was 5.000000. running mean: 0.346431
episode 4163.000000, reward total was 7.000000. running mean: 0.412966

episode 4265.000000, reward total was 5.000000. running mean: 1.208544
episode 4266.000000, reward total was 8.000000. running mean: 1.276459
episode 4267.000000, reward total was -4.000000. running mean: 1.223694
episode 4268.000000, reward total was 10.000000. running mean: 1.311457
episode 4269.000000, reward total was -7.000000. running mean: 1.228342
episode 4270.000000, reward total was -5.000000. running mean: 1.166059
episode 4271.000000, reward total was -3.000000. running mean: 1.124398
episode 4272.000000, reward total was 6.000000. running mean: 1.173154
episode 4273.000000, reward total was -4.000000. running mean: 1.121423
episode 4274.000000, reward total was 4.000000. running mean: 1.150209
episode 4275.000000, reward total was 2.000000. running mean: 1.158707
episode 4276.000000, reward total was -1.000000. running mean: 1.137120
episode 4277.000000, reward total was 4.000000. running mean: 1.165748
episode 4278.000000, reward total was -3.000000. running mean: 1.12409

episode 4380.000000, reward total was 4.000000. running mean: 1.686209
episode 4381.000000, reward total was 3.000000. running mean: 1.699347
episode 4382.000000, reward total was -4.000000. running mean: 1.642354
episode 4383.000000, reward total was 1.000000. running mean: 1.635930
episode 4384.000000, reward total was -4.000000. running mean: 1.579571
episode 4385.000000, reward total was 3.000000. running mean: 1.593775
episode 4386.000000, reward total was 3.000000. running mean: 1.607838
episode 4387.000000, reward total was -4.000000. running mean: 1.551759
episode 4388.000000, reward total was -12.000000. running mean: 1.416242
episode 4389.000000, reward total was -1.000000. running mean: 1.392079
episode 4390.000000, reward total was 5.000000. running mean: 1.428158
episode 4391.000000, reward total was -4.000000. running mean: 1.373877
episode 4392.000000, reward total was -10.000000. running mean: 1.260138
episode 4393.000000, reward total was 7.000000. running mean: 1.3175

episode 4495.000000, reward total was 13.000000. running mean: 0.975484
episode 4496.000000, reward total was 4.000000. running mean: 1.005729
episode 4497.000000, reward total was 3.000000. running mean: 1.025671
episode 4498.000000, reward total was -3.000000. running mean: 0.985415
episode 4499.000000, reward total was -3.000000. running mean: 0.945561
episode 4500.000000, reward total was -3.000000. running mean: 0.906105
episode 4501.000000, reward total was -3.000000. running mean: 0.867044
episode 4502.000000, reward total was -5.000000. running mean: 0.808373
episode 4503.000000, reward total was 4.000000. running mean: 0.840290
episode 4504.000000, reward total was 6.000000. running mean: 0.891887
episode 4505.000000, reward total was 4.000000. running mean: 0.922968
episode 4506.000000, reward total was 6.000000. running mean: 0.973738
episode 4507.000000, reward total was 12.000000. running mean: 1.084001
episode 4508.000000, reward total was 8.000000. running mean: 1.153161

episode 4610.000000, reward total was 1.000000. running mean: 1.629771
episode 4611.000000, reward total was 3.000000. running mean: 1.643474
episode 4612.000000, reward total was 8.000000. running mean: 1.707039
episode 4613.000000, reward total was -1.000000. running mean: 1.679969
episode 4614.000000, reward total was 11.000000. running mean: 1.773169
episode 4615.000000, reward total was -9.000000. running mean: 1.665437
episode 4616.000000, reward total was -2.000000. running mean: 1.628783
episode 4617.000000, reward total was 4.000000. running mean: 1.652495
episode 4618.000000, reward total was -3.000000. running mean: 1.605970
episode 4619.000000, reward total was 4.000000. running mean: 1.629910
episode 4620.000000, reward total was -14.000000. running mean: 1.473611
episode 4621.000000, reward total was 12.000000. running mean: 1.578875
episode 4622.000000, reward total was 5.000000. running mean: 1.613086
episode 4623.000000, reward total was -6.000000. running mean: 1.5369

episode 4725.000000, reward total was 10.000000. running mean: 1.656650
episode 4726.000000, reward total was 4.000000. running mean: 1.680084
episode 4727.000000, reward total was 3.000000. running mean: 1.693283
episode 4728.000000, reward total was 2.000000. running mean: 1.696350
episode 4729.000000, reward total was -3.000000. running mean: 1.649387
episode 4730.000000, reward total was -10.000000. running mean: 1.532893
episode 4731.000000, reward total was 7.000000. running mean: 1.587564
episode 4732.000000, reward total was 3.000000. running mean: 1.601688
episode 4733.000000, reward total was -4.000000. running mean: 1.545671
episode 4734.000000, reward total was 1.000000. running mean: 1.540215
episode 4735.000000, reward total was 2.000000. running mean: 1.544812
episode 4736.000000, reward total was -9.000000. running mean: 1.439364
episode 4737.000000, reward total was -3.000000. running mean: 1.394971
episode 4738.000000, reward total was 12.000000. running mean: 1.50102

episode 4840.000000, reward total was 7.000000. running mean: 2.070844
episode 4841.000000, reward total was 3.000000. running mean: 2.080135
episode 4842.000000, reward total was -6.000000. running mean: 1.999334
episode 4843.000000, reward total was 4.000000. running mean: 2.019341
episode 4844.000000, reward total was -3.000000. running mean: 1.969147
episode 4845.000000, reward total was 9.000000. running mean: 2.039456
episode 4846.000000, reward total was 1.000000. running mean: 2.029061
episode 4847.000000, reward total was 8.000000. running mean: 2.088771
episode 4848.000000, reward total was 10.000000. running mean: 2.167883
episode 4849.000000, reward total was 14.000000. running mean: 2.286204
episode 4850.000000, reward total was 7.000000. running mean: 2.333342
episode 4851.000000, reward total was 8.000000. running mean: 2.390009
episode 4852.000000, reward total was -2.000000. running mean: 2.346109
episode 4853.000000, reward total was 3.000000. running mean: 2.352648
e

episode 4955.000000, reward total was 11.000000. running mean: 2.679270
episode 4956.000000, reward total was -2.000000. running mean: 2.632477
episode 4957.000000, reward total was 5.000000. running mean: 2.656152
episode 4958.000000, reward total was 1.000000. running mean: 2.639591
episode 4959.000000, reward total was 8.000000. running mean: 2.693195
episode 4960.000000, reward total was 5.000000. running mean: 2.716263
episode 4961.000000, reward total was 2.000000. running mean: 2.709100
episode 4962.000000, reward total was 5.000000. running mean: 2.732009
episode 4963.000000, reward total was 5.000000. running mean: 2.754689
episode 4964.000000, reward total was 4.000000. running mean: 2.767142
episode 4965.000000, reward total was -3.000000. running mean: 2.709471
episode 4966.000000, reward total was 7.000000. running mean: 2.752376
episode 4967.000000, reward total was 4.000000. running mean: 2.764852
episode 4968.000000, reward total was 10.000000. running mean: 2.837204
ep

episode 5070.000000, reward total was 4.000000. running mean: 2.725803
episode 5071.000000, reward total was 10.000000. running mean: 2.798545
episode 5072.000000, reward total was -1.000000. running mean: 2.760559
episode 5073.000000, reward total was 2.000000. running mean: 2.752954
episode 5074.000000, reward total was -1.000000. running mean: 2.715424
episode 5075.000000, reward total was 9.000000. running mean: 2.778270
episode 5076.000000, reward total was 1.000000. running mean: 2.760487
episode 5077.000000, reward total was 1.000000. running mean: 2.742882
episode 5078.000000, reward total was 3.000000. running mean: 2.745454
episode 5079.000000, reward total was 12.000000. running mean: 2.837999
episode 5080.000000, reward total was -7.000000. running mean: 2.739619
episode 5081.000000, reward total was 10.000000. running mean: 2.812223
episode 5082.000000, reward total was -4.000000. running mean: 2.744101
episode 5083.000000, reward total was 8.000000. running mean: 2.796660

episode 5185.000000, reward total was -1.000000. running mean: 3.849433
episode 5186.000000, reward total was 3.000000. running mean: 3.840939
episode 5187.000000, reward total was 1.000000. running mean: 3.812529
episode 5188.000000, reward total was 6.000000. running mean: 3.834404
episode 5189.000000, reward total was 12.000000. running mean: 3.916060
episode 5190.000000, reward total was 8.000000. running mean: 3.956899
episode 5191.000000, reward total was 6.000000. running mean: 3.977330
episode 5192.000000, reward total was 1.000000. running mean: 3.947557
episode 5193.000000, reward total was 5.000000. running mean: 3.958081
episode 5194.000000, reward total was -9.000000. running mean: 3.828501
episode 5195.000000, reward total was 1.000000. running mean: 3.800216
episode 5196.000000, reward total was 2.000000. running mean: 3.782213
episode 5197.000000, reward total was 4.000000. running mean: 3.784391
episode 5198.000000, reward total was 4.000000. running mean: 3.786547
epi

episode 5300.000000, reward total was -7.000000. running mean: 3.285059
episode 5301.000000, reward total was 1.000000. running mean: 3.262209
episode 5302.000000, reward total was 10.000000. running mean: 3.329587
episode 5303.000000, reward total was 8.000000. running mean: 3.376291
episode 5304.000000, reward total was 5.000000. running mean: 3.392528
episode 5305.000000, reward total was 8.000000. running mean: 3.438603
episode 5306.000000, reward total was 6.000000. running mean: 3.464217
episode 5307.000000, reward total was 6.000000. running mean: 3.489575
episode 5308.000000, reward total was -3.000000. running mean: 3.424679
episode 5309.000000, reward total was 1.000000. running mean: 3.400432
episode 5310.000000, reward total was -8.000000. running mean: 3.286428
episode 5311.000000, reward total was 5.000000. running mean: 3.303563
episode 5312.000000, reward total was -3.000000. running mean: 3.240528
episode 5313.000000, reward total was -4.000000. running mean: 3.168123


episode 5415.000000, reward total was -8.000000. running mean: 3.268432
episode 5416.000000, reward total was -11.000000. running mean: 3.125748
episode 5417.000000, reward total was -4.000000. running mean: 3.054490
episode 5418.000000, reward total was -2.000000. running mean: 3.003946
episode 5419.000000, reward total was 4.000000. running mean: 3.013906
episode 5420.000000, reward total was 10.000000. running mean: 3.083767
episode 5421.000000, reward total was -9.000000. running mean: 2.962929
episode 5422.000000, reward total was -15.000000. running mean: 2.783300
episode 5423.000000, reward total was 10.000000. running mean: 2.855467
episode 5424.000000, reward total was -7.000000. running mean: 2.756912
episode 5425.000000, reward total was 4.000000. running mean: 2.769343
episode 5426.000000, reward total was -3.000000. running mean: 2.711650
episode 5427.000000, reward total was -7.000000. running mean: 2.614533
episode 5428.000000, reward total was 5.000000. running mean: 2.

episode 5530.000000, reward total was 7.000000. running mean: 3.688934
episode 5531.000000, reward total was 7.000000. running mean: 3.722045
episode 5532.000000, reward total was 7.000000. running mean: 3.754825
episode 5533.000000, reward total was 2.000000. running mean: 3.737276
episode 5534.000000, reward total was 2.000000. running mean: 3.719904
episode 5535.000000, reward total was -3.000000. running mean: 3.652705
episode 5536.000000, reward total was 3.000000. running mean: 3.646178
episode 5537.000000, reward total was -9.000000. running mean: 3.519716
episode 5538.000000, reward total was -5.000000. running mean: 3.434519
episode 5539.000000, reward total was 5.000000. running mean: 3.450173
episode 5540.000000, reward total was 2.000000. running mean: 3.435672
episode 5541.000000, reward total was 9.000000. running mean: 3.491315
episode 5542.000000, reward total was 6.000000. running mean: 3.516402
episode 5543.000000, reward total was 9.000000. running mean: 3.571238
epi

episode 5645.000000, reward total was 12.000000. running mean: 4.010273
episode 5646.000000, reward total was 6.000000. running mean: 4.030171
episode 5647.000000, reward total was -5.000000. running mean: 3.939869
episode 5648.000000, reward total was 6.000000. running mean: 3.960470
episode 5649.000000, reward total was -2.000000. running mean: 3.900866
episode 5650.000000, reward total was -4.000000. running mean: 3.821857
episode 5651.000000, reward total was -4.000000. running mean: 3.743638
episode 5652.000000, reward total was -6.000000. running mean: 3.646202
episode 5653.000000, reward total was 7.000000. running mean: 3.679740
episode 5654.000000, reward total was 8.000000. running mean: 3.722943
episode 5655.000000, reward total was 3.000000. running mean: 3.715713
episode 5656.000000, reward total was 2.000000. running mean: 3.698556
episode 5657.000000, reward total was 4.000000. running mean: 3.701570
episode 5658.000000, reward total was 3.000000. running mean: 3.694555


episode 5760.000000, reward total was 4.000000. running mean: 4.336230
episode 5761.000000, reward total was 2.000000. running mean: 4.312867
episode 5762.000000, reward total was 14.000000. running mean: 4.409739
episode 5763.000000, reward total was -1.000000. running mean: 4.355641
episode 5764.000000, reward total was 6.000000. running mean: 4.372085
episode 5765.000000, reward total was 1.000000. running mean: 4.338364
episode 5766.000000, reward total was 7.000000. running mean: 4.364980
episode 5767.000000, reward total was -1.000000. running mean: 4.311331
episode 5768.000000, reward total was 8.000000. running mean: 4.348217
episode 5769.000000, reward total was 6.000000. running mean: 4.364735
episode 5770.000000, reward total was 7.000000. running mean: 4.391088
episode 5771.000000, reward total was 6.000000. running mean: 4.407177
episode 5772.000000, reward total was 10.000000. running mean: 4.463105
episode 5773.000000, reward total was 6.000000. running mean: 4.478474
ep

episode 5875.000000, reward total was -1.000000. running mean: 4.055041
episode 5876.000000, reward total was 6.000000. running mean: 4.074491
episode 5877.000000, reward total was 7.000000. running mean: 4.103746
episode 5878.000000, reward total was 7.000000. running mean: 4.132708
episode 5879.000000, reward total was 5.000000. running mean: 4.141381
episode 5880.000000, reward total was 9.000000. running mean: 4.189967
episode 5881.000000, reward total was 11.000000. running mean: 4.258068
episode 5882.000000, reward total was 3.000000. running mean: 4.245487
episode 5883.000000, reward total was -16.000000. running mean: 4.043032
episode 5884.000000, reward total was 18.000000. running mean: 4.182602
episode 5885.000000, reward total was 12.000000. running mean: 4.260776
episode 5886.000000, reward total was 4.000000. running mean: 4.258168
episode 5887.000000, reward total was -3.000000. running mean: 4.185586
episode 5888.000000, reward total was 10.000000. running mean: 4.24373

episode 5990.000000, reward total was 12.000000. running mean: 4.380396
episode 5991.000000, reward total was 1.000000. running mean: 4.346592
episode 5992.000000, reward total was 5.000000. running mean: 4.353126
episode 5993.000000, reward total was 4.000000. running mean: 4.349595
episode 5994.000000, reward total was 3.000000. running mean: 4.336099
episode 5995.000000, reward total was 12.000000. running mean: 4.412738
episode 5996.000000, reward total was -5.000000. running mean: 4.318611
episode 5997.000000, reward total was 3.000000. running mean: 4.305425
episode 5998.000000, reward total was 1.000000. running mean: 4.272370
episode 5999.000000, reward total was 8.000000. running mean: 4.309647
episode 6000.000000, reward total was 14.000000. running mean: 4.406550
CPU times: total: 1d 7h 29min 28s
Wall time: 1d 12h 37min 39s


In [7]:
# Run the policy stored in `model` against `env` after the training run above.
# NOTE(review): play_game is defined in an earlier cell not visible here; judging by
# the cell output it presumably plays one episode and prints the accumulated reward — confirm.
play_game(env, model)

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  logger.warn(


Episode finished without success, accumulated reward = 1.0
