In [1]:
import gym
import numpy as np
# Gym environment id; passed to gym.make() when the environment is created.
rm='Pong-v4'

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Show a sequence of image frames as an inline animation.

    Despite the name, no GIF file is produced: the frames are wrapped in a
    matplotlib ``FuncAnimation``, converted with ``to_jshtml()`` and displayed
    through ``IPython.display``.

    Parameters
    ----------
    frames : sequence of image arrays
        Frames to play back in order. The first frame's ``shape[0]`` (height)
        and ``shape[1]`` (width) determine the figure size.

    Returns
    -------
    None
    """
    height, width = frames[0].shape[0], frames[0].shape[1]
    # Keep an explicit handle on the figure instead of relying on plt.gcf()
    # and the private FuncAnimation._fig attribute.
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi = 144)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # Swap the displayed image for frame i.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames = len(frames), interval=50)
    # Close the figure before displaying the HTML player (presumably so the
    # inline backend does not also render a static copy of the first frame).
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

In [3]:
from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the gym module; it is not
# passed to gym.make(), and the code below still unpacks four values from
# env.step(). Confirm whether this line has any effect.
gym.new_step_api=True
env = gym.make(rm)
# model initialization
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights (and therefore the training run) differ between executions.
H = 1600 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
model = {}
# W1 has shape (H, D): input -> hidden weights.
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
# W2 has shape (H,): hidden -> single output logit.
model['W2'] = np.random.randn(H) / np.sqrt(H)
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
#learning_rate = 1e-3
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dicts mirror the keys and shapes of `model` and start at zero.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
  """Logistic sigmoid 1 / (1 + exp(-x)), squashing x into the interval [0, 1].

  Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp, so large negative
  inputs no longer overflow inside np.exp (the naive form emits a
  RuntimeWarning there). Works elementwise on scalars and arrays.
  """
  return np.exp(-np.logaddexp(0.0, -x))

def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

  Rows 35..194 are kept, every second row/column of channel 0 is sampled, and
  the result is binarised: pixels equal to 0, 144 or 109 (the two background
  colours) become 0.0, everything else (paddles, ball) becomes 1.0.

  The input frame is never written to. The previous implementation masked a
  view of the input in place, which altered the caller's array and failed on
  read-only frames.
  """
  I = np.asarray(I)
  I = I[35:195:2, ::2, 0] # crop and downsample by factor of 2 in one slice (still a view)
  # Build the binary image as a fresh array instead of assigning into the view.
  foreground = (I != 0) & (I != 144) & (I != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r):
  """ Turn an array of per-step rewards into discounted returns.

  The rewards are walked from the last step to the first, carrying a running
  sum that is multiplied by the module-level `gamma` at each step. A non-zero
  reward marks a game boundary (Pong specific), so the running sum is reset
  there before that reward is added.

  Returns an array with the same shape and dtype as `r`.
  """
  returns = np.zeros_like(r)
  carry = 0
  for idx in range(r.size - 1, -1, -1):
    reward = r[idx]
    if reward != 0:
      carry = 0 # game boundary: do not leak later returns into this point
    carry = carry * gamma + reward
    returns[idx] = carry
  return returns

def policy_forward(x):
  """ Forward pass of the two-layer policy network.

  Reads the weights from the module-level `model`: W1 maps the input to the
  hidden layer (followed by a ReLU), W2 maps the hidden layer to one logit.

  Returns (p, h): the sigmoid of the logit, i.e. the probability of taking
  action 2, and the hidden activations.
  """
  pre_activation = np.dot(model['W1'], x)
  hidden = np.where(pre_activation < 0, 0, pre_activation) # ReLU nonlinearity
  logit = np.dot(model['W2'], hidden)
  return sigmoid(logit), hidden

def policy_backward(epx, eph, epdlogp):
  """ Backward pass over one episode.

  epx, eph and epdlogp are the stacked inputs, hidden states and (already
  advantage-weighted) output gradients as built in train_model, one row per
  time step. W2 is read from the module-level `model`.

  Returns a dict with the gradients for 'W1' and 'W2'.
  """
  grad_w2 = eph.T.dot(epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, model['W2'])
  grad_hidden[eph <= 0] = 0 # backprop through ReLU: inactive units pass no gradient
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """ Choose the greedy action for one frame.

  The observation is preprocessed with prepro; the network input is the
  difference to the previous preprocessed frame, or all zeros when prev_x is
  None (first frame). Action 2 is returned when the policy probability is
  >= 0.5, otherwise action 3 -- nothing is sampled here.

  NOTE(review): the `model` argument is not used in this body; policy_forward
  reads the module-level `model`.

  Returns (action, cur_x), where cur_x is the preprocessed current frame to
  be passed back in as prev_x on the next call.
  """
  cur_x = prepro(observation)
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # forward the policy network and take the more likely action
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model):
  """ Play one episode (at most 1000 steps) greedily and show it as an animation.

  Each step renders the environment to an RGB frame, picks an action with
  model_step and advances the environment. A summary line is printed, the
  environment is closed and the collected frames are displayed with
  display_frames_as_gif.

  Args:
    env: environment providing reset(), render(mode='rgb_array'),
      step(action) -> (observation, reward, done, info) and close().
    model: forwarded to model_step.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(1000):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # Reached only when the step budget ran out without `done`. Previously
    # this message was printed unconditionally, even after a finished episode.
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """ Train the policy network with policy gradients.

  Plays episodes in `env`, sampling action 2 or 3 from the policy probability
  at every step. After each finished episode the advantage-weighted gradient
  is added to the module-level `grad_buffer`; every `batch_size` episodes the
  weights in `model` are updated in place with an RMSProp step whose state
  lives in the module-level `rmsprop_cache`.

  NOTE(review): policy_forward and policy_backward read the module-level
  `model`, so results are only consistent when the `model` argument is that
  same dict.

  Args:
    env: environment providing reset() and
      step(action) -> (observation, reward, done, info).
    model: dict of weight arrays, updated in place.
    total_episodes: number of episodes to play; training stops once at least
      this many episodes have finished.

  Returns:
    A list with one (episode_number, reward_sum, running_reward) tuple per
    finished episode.
  """
  hist = []
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        # Skip the division when every return is identical: 0/0 would give NaN
        # gradients that permanently corrupt the weights. The centred returns
        # are all zero in that case, so the episode simply contributes nothing.
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5) # gradient *ascent* step
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      # `>=` rather than `==` so that total_episodes <= 0 cannot loop forever
      if episode_number >= total_episodes:
        return hist

    # if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
    #   print('ep {}: game finished, reward: {}'.format(episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

  deprecation(
  deprecation(


In [4]:
env.action_space

Discrete(6)

In [5]:
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [6]:
%time hist1 = train_model(env, model, total_episodes=6000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -21.000000. running mean: -21.000000
episode 3.000000, reward total was -21.000000. running mean: -21.000000
episode 4.000000, reward total was -19.000000. running mean: -20.980000
episode 5.000000, reward total was -20.000000. running mean: -20.970200
episode 6.000000, reward total was -21.000000. running mean: -20.970498
episode 7.000000, reward total was -21.000000. running mean: -20.970793
episode 8.000000, reward total was -19.000000. running mean: -20.951085
episode 9.000000, reward total was -20.000000. running mean: -20.941574
episode 10.000000, reward total was -20.000000. running mean: -20.932158
episode 11.000000, reward total was -21.000000. running mean: -20.932837
episode 12.000000, reward total was -21.000000. running mean: -20.933509
episode 13.000000, reward total was -21.000000. running mean: -20.934173
episode 14.000000, reward total was -20.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.673947
episode 115.000000, reward total was -21.000000. running mean: -20.677208
episode 116.000000, reward total was -21.000000. running mean: -20.680436
episode 117.000000, reward total was -21.000000. running mean: -20.683631
episode 118.000000, reward total was -21.000000. running mean: -20.686795
episode 119.000000, reward total was -21.000000. running mean: -20.689927
episode 120.000000, reward total was -21.000000. running mean: -20.693028
episode 121.000000, reward total was -21.000000. running mean: -20.696097
episode 122.000000, reward total was -21.000000. running mean: -20.699136
episode 123.000000, reward total was -21.000000. running mean: -20.702145
episode 124.000000, reward total was -20.000000. running mean: -20.695124
episode 125.000000, reward total was -20.000000. running mean: -20.688172
episode 126.000000, reward total was -21.000000. running mean: -20.691291
episode 127.000000, reward total was -

episode 225.000000, reward total was -20.000000. running mean: -20.362841
episode 226.000000, reward total was -20.000000. running mean: -20.359213
episode 227.000000, reward total was -21.000000. running mean: -20.365621
episode 228.000000, reward total was -21.000000. running mean: -20.371964
episode 229.000000, reward total was -19.000000. running mean: -20.358245
episode 230.000000, reward total was -21.000000. running mean: -20.364662
episode 231.000000, reward total was -20.000000. running mean: -20.361016
episode 232.000000, reward total was -19.000000. running mean: -20.347406
episode 233.000000, reward total was -19.000000. running mean: -20.333932
episode 234.000000, reward total was -21.000000. running mean: -20.340592
episode 235.000000, reward total was -19.000000. running mean: -20.327186
episode 236.000000, reward total was -21.000000. running mean: -20.333914
episode 237.000000, reward total was -21.000000. running mean: -20.340575
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -20.369867
episode 337.000000, reward total was -21.000000. running mean: -20.376168
episode 338.000000, reward total was -21.000000. running mean: -20.382406
episode 339.000000, reward total was -21.000000. running mean: -20.388582
episode 340.000000, reward total was -19.000000. running mean: -20.374697
episode 341.000000, reward total was -21.000000. running mean: -20.380950
episode 342.000000, reward total was -20.000000. running mean: -20.377140
episode 343.000000, reward total was -20.000000. running mean: -20.373369
episode 344.000000, reward total was -21.000000. running mean: -20.379635
episode 345.000000, reward total was -19.000000. running mean: -20.365839
episode 346.000000, reward total was -20.000000. running mean: -20.362180
episode 347.000000, reward total was -19.000000. running mean: -20.348558
episode 348.000000, reward total was -20.000000. running mean: -20.345073
episode 349.000000, reward total was -

episode 447.000000, reward total was -19.000000. running mean: -20.115769
episode 448.000000, reward total was -21.000000. running mean: -20.124611
episode 449.000000, reward total was -18.000000. running mean: -20.103365
episode 450.000000, reward total was -20.000000. running mean: -20.102332
episode 451.000000, reward total was -20.000000. running mean: -20.101308
episode 452.000000, reward total was -19.000000. running mean: -20.090295
episode 453.000000, reward total was -21.000000. running mean: -20.099392
episode 454.000000, reward total was -21.000000. running mean: -20.108398
episode 455.000000, reward total was -21.000000. running mean: -20.117314
episode 456.000000, reward total was -18.000000. running mean: -20.096141
episode 457.000000, reward total was -20.000000. running mean: -20.095180
episode 458.000000, reward total was -20.000000. running mean: -20.094228
episode 459.000000, reward total was -20.000000. running mean: -20.093286
episode 460.000000, reward total was -

episode 558.000000, reward total was -20.000000. running mean: -20.227813
episode 559.000000, reward total was -19.000000. running mean: -20.215535
episode 560.000000, reward total was -21.000000. running mean: -20.223379
episode 561.000000, reward total was -20.000000. running mean: -20.221146
episode 562.000000, reward total was -21.000000. running mean: -20.228934
episode 563.000000, reward total was -21.000000. running mean: -20.236645
episode 564.000000, reward total was -21.000000. running mean: -20.244278
episode 565.000000, reward total was -21.000000. running mean: -20.251836
episode 566.000000, reward total was -21.000000. running mean: -20.259317
episode 567.000000, reward total was -21.000000. running mean: -20.266724
episode 568.000000, reward total was -19.000000. running mean: -20.254057
episode 569.000000, reward total was -20.000000. running mean: -20.251516
episode 570.000000, reward total was -21.000000. running mean: -20.259001
episode 571.000000, reward total was -

episode 669.000000, reward total was -20.000000. running mean: -20.081886
episode 670.000000, reward total was -20.000000. running mean: -20.081067
episode 671.000000, reward total was -19.000000. running mean: -20.070257
episode 672.000000, reward total was -20.000000. running mean: -20.069554
episode 673.000000, reward total was -20.000000. running mean: -20.068859
episode 674.000000, reward total was -21.000000. running mean: -20.078170
episode 675.000000, reward total was -20.000000. running mean: -20.077388
episode 676.000000, reward total was -21.000000. running mean: -20.086614
episode 677.000000, reward total was -21.000000. running mean: -20.095748
episode 678.000000, reward total was -21.000000. running mean: -20.104791
episode 679.000000, reward total was -21.000000. running mean: -20.113743
episode 680.000000, reward total was -20.000000. running mean: -20.112605
episode 681.000000, reward total was -21.000000. running mean: -20.121479
episode 682.000000, reward total was -

episode 780.000000, reward total was -21.000000. running mean: -20.075168
episode 781.000000, reward total was -20.000000. running mean: -20.074416
episode 782.000000, reward total was -17.000000. running mean: -20.043672
episode 783.000000, reward total was -19.000000. running mean: -20.033235
episode 784.000000, reward total was -19.000000. running mean: -20.022903
episode 785.000000, reward total was -20.000000. running mean: -20.022674
episode 786.000000, reward total was -20.000000. running mean: -20.022447
episode 787.000000, reward total was -19.000000. running mean: -20.012222
episode 788.000000, reward total was -20.000000. running mean: -20.012100
episode 789.000000, reward total was -21.000000. running mean: -20.021979
episode 790.000000, reward total was -20.000000. running mean: -20.021759
episode 791.000000, reward total was -17.000000. running mean: -19.991542
episode 792.000000, reward total was -20.000000. running mean: -19.991626
episode 793.000000, reward total was -

episode 891.000000, reward total was -20.000000. running mean: -19.936991
episode 892.000000, reward total was -21.000000. running mean: -19.947621
episode 893.000000, reward total was -20.000000. running mean: -19.948145
episode 894.000000, reward total was -19.000000. running mean: -19.938663
episode 895.000000, reward total was -21.000000. running mean: -19.949277
episode 896.000000, reward total was -20.000000. running mean: -19.949784
episode 897.000000, reward total was -19.000000. running mean: -19.940286
episode 898.000000, reward total was -21.000000. running mean: -19.950883
episode 899.000000, reward total was -21.000000. running mean: -19.961374
episode 900.000000, reward total was -21.000000. running mean: -19.971761
episode 901.000000, reward total was -20.000000. running mean: -19.972043
episode 902.000000, reward total was -18.000000. running mean: -19.952323
episode 903.000000, reward total was -20.000000. running mean: -19.952799
episode 904.000000, reward total was -

episode 1002.000000, reward total was -21.000000. running mean: -19.869271
episode 1003.000000, reward total was -21.000000. running mean: -19.880578
episode 1004.000000, reward total was -21.000000. running mean: -19.891772
episode 1005.000000, reward total was -20.000000. running mean: -19.892855
episode 1006.000000, reward total was -20.000000. running mean: -19.893926
episode 1007.000000, reward total was -20.000000. running mean: -19.894987
episode 1008.000000, reward total was -20.000000. running mean: -19.896037
episode 1009.000000, reward total was -21.000000. running mean: -19.907077
episode 1010.000000, reward total was -20.000000. running mean: -19.908006
episode 1011.000000, reward total was -18.000000. running mean: -19.888926
episode 1012.000000, reward total was -21.000000. running mean: -19.900037
episode 1013.000000, reward total was -21.000000. running mean: -19.911036
episode 1014.000000, reward total was -20.000000. running mean: -19.911926
episode 1015.000000, rewa

episode 1112.000000, reward total was -21.000000. running mean: -19.938976
episode 1113.000000, reward total was -17.000000. running mean: -19.909586
episode 1114.000000, reward total was -20.000000. running mean: -19.910490
episode 1115.000000, reward total was -19.000000. running mean: -19.901385
episode 1116.000000, reward total was -20.000000. running mean: -19.902371
episode 1117.000000, reward total was -20.000000. running mean: -19.903347
episode 1118.000000, reward total was -20.000000. running mean: -19.904314
episode 1119.000000, reward total was -18.000000. running mean: -19.885271
episode 1120.000000, reward total was -19.000000. running mean: -19.876418
episode 1121.000000, reward total was -20.000000. running mean: -19.877654
episode 1122.000000, reward total was -19.000000. running mean: -19.868877
episode 1123.000000, reward total was -20.000000. running mean: -19.870189
episode 1124.000000, reward total was -21.000000. running mean: -19.881487
episode 1125.000000, rewa

episode 1222.000000, reward total was -20.000000. running mean: -19.708139
episode 1223.000000, reward total was -20.000000. running mean: -19.711058
episode 1224.000000, reward total was -21.000000. running mean: -19.723947
episode 1225.000000, reward total was -18.000000. running mean: -19.706708
episode 1226.000000, reward total was -20.000000. running mean: -19.709641
episode 1227.000000, reward total was -18.000000. running mean: -19.692544
episode 1228.000000, reward total was -18.000000. running mean: -19.675619
episode 1229.000000, reward total was -20.000000. running mean: -19.678863
episode 1230.000000, reward total was -19.000000. running mean: -19.672074
episode 1231.000000, reward total was -20.000000. running mean: -19.675353
episode 1232.000000, reward total was -21.000000. running mean: -19.688600
episode 1233.000000, reward total was -19.000000. running mean: -19.681714
episode 1234.000000, reward total was -20.000000. running mean: -19.684897
episode 1235.000000, rewa

episode 1332.000000, reward total was -21.000000. running mean: -19.687990
episode 1333.000000, reward total was -20.000000. running mean: -19.691110
episode 1334.000000, reward total was -20.000000. running mean: -19.694199
episode 1335.000000, reward total was -21.000000. running mean: -19.707257
episode 1336.000000, reward total was -18.000000. running mean: -19.690184
episode 1337.000000, reward total was -16.000000. running mean: -19.653283
episode 1338.000000, reward total was -21.000000. running mean: -19.666750
episode 1339.000000, reward total was -21.000000. running mean: -19.680082
episode 1340.000000, reward total was -20.000000. running mean: -19.683281
episode 1341.000000, reward total was -21.000000. running mean: -19.696449
episode 1342.000000, reward total was -20.000000. running mean: -19.699484
episode 1343.000000, reward total was -20.000000. running mean: -19.702489
episode 1344.000000, reward total was -20.000000. running mean: -19.705464
episode 1345.000000, rewa

episode 1442.000000, reward total was -20.000000. running mean: -19.522249
episode 1443.000000, reward total was -21.000000. running mean: -19.537027
episode 1444.000000, reward total was -21.000000. running mean: -19.551657
episode 1445.000000, reward total was -18.000000. running mean: -19.536140
episode 1446.000000, reward total was -16.000000. running mean: -19.500779
episode 1447.000000, reward total was -18.000000. running mean: -19.485771
episode 1448.000000, reward total was -19.000000. running mean: -19.480913
episode 1449.000000, reward total was -21.000000. running mean: -19.496104
episode 1450.000000, reward total was -21.000000. running mean: -19.511143
episode 1451.000000, reward total was -17.000000. running mean: -19.486032
episode 1452.000000, reward total was -21.000000. running mean: -19.501171
episode 1453.000000, reward total was -20.000000. running mean: -19.506160
episode 1454.000000, reward total was -20.000000. running mean: -19.511098
episode 1455.000000, rewa

episode 1552.000000, reward total was -20.000000. running mean: -19.372128
episode 1553.000000, reward total was -19.000000. running mean: -19.368407
episode 1554.000000, reward total was -19.000000. running mean: -19.364723
episode 1555.000000, reward total was -19.000000. running mean: -19.361076
episode 1556.000000, reward total was -17.000000. running mean: -19.337465
episode 1557.000000, reward total was -20.000000. running mean: -19.344090
episode 1558.000000, reward total was -18.000000. running mean: -19.330649
episode 1559.000000, reward total was -19.000000. running mean: -19.327343
episode 1560.000000, reward total was -16.000000. running mean: -19.294070
episode 1561.000000, reward total was -19.000000. running mean: -19.291129
episode 1562.000000, reward total was -18.000000. running mean: -19.278218
episode 1563.000000, reward total was -21.000000. running mean: -19.295435
episode 1564.000000, reward total was -17.000000. running mean: -19.272481
episode 1565.000000, rewa

episode 1662.000000, reward total was -20.000000. running mean: -19.129588
episode 1663.000000, reward total was -21.000000. running mean: -19.148292
episode 1664.000000, reward total was -18.000000. running mean: -19.136809
episode 1665.000000, reward total was -19.000000. running mean: -19.135441
episode 1666.000000, reward total was -17.000000. running mean: -19.114087
episode 1667.000000, reward total was -17.000000. running mean: -19.092946
episode 1668.000000, reward total was -19.000000. running mean: -19.092016
episode 1669.000000, reward total was -19.000000. running mean: -19.091096
episode 1670.000000, reward total was -17.000000. running mean: -19.070185
episode 1671.000000, reward total was -21.000000. running mean: -19.089484
episode 1672.000000, reward total was -20.000000. running mean: -19.098589
episode 1673.000000, reward total was -16.000000. running mean: -19.067603
episode 1674.000000, reward total was -21.000000. running mean: -19.086927
episode 1675.000000, rewa

episode 1772.000000, reward total was -16.000000. running mean: -18.976108
episode 1773.000000, reward total was -21.000000. running mean: -18.996347
episode 1774.000000, reward total was -19.000000. running mean: -18.996383
episode 1775.000000, reward total was -20.000000. running mean: -19.006419
episode 1776.000000, reward total was -20.000000. running mean: -19.016355
episode 1777.000000, reward total was -18.000000. running mean: -19.006192
episode 1778.000000, reward total was -17.000000. running mean: -18.986130
episode 1779.000000, reward total was -16.000000. running mean: -18.956268
episode 1780.000000, reward total was -20.000000. running mean: -18.966706
episode 1781.000000, reward total was -21.000000. running mean: -18.987039
episode 1782.000000, reward total was -20.000000. running mean: -18.997168
episode 1783.000000, reward total was -20.000000. running mean: -19.007197
episode 1784.000000, reward total was -17.000000. running mean: -18.987125
episode 1785.000000, rewa

episode 1882.000000, reward total was -19.000000. running mean: -18.932403
episode 1883.000000, reward total was -19.000000. running mean: -18.933079
episode 1884.000000, reward total was -19.000000. running mean: -18.933748
episode 1885.000000, reward total was -20.000000. running mean: -18.944411
episode 1886.000000, reward total was -19.000000. running mean: -18.944967
episode 1887.000000, reward total was -18.000000. running mean: -18.935517
episode 1888.000000, reward total was -19.000000. running mean: -18.936162
episode 1889.000000, reward total was -20.000000. running mean: -18.946800
episode 1890.000000, reward total was -18.000000. running mean: -18.937332
episode 1891.000000, reward total was -21.000000. running mean: -18.957959
episode 1892.000000, reward total was -20.000000. running mean: -18.968379
episode 1893.000000, reward total was -19.000000. running mean: -18.968696
episode 1894.000000, reward total was -17.000000. running mean: -18.949009
episode 1895.000000, rewa

episode 1992.000000, reward total was -20.000000. running mean: -18.712649
episode 1993.000000, reward total was -20.000000. running mean: -18.725522
episode 1994.000000, reward total was -19.000000. running mean: -18.728267
episode 1995.000000, reward total was -20.000000. running mean: -18.740985
episode 1996.000000, reward total was -19.000000. running mean: -18.743575
episode 1997.000000, reward total was -18.000000. running mean: -18.736139
episode 1998.000000, reward total was -13.000000. running mean: -18.678778
episode 1999.000000, reward total was -19.000000. running mean: -18.681990
episode 2000.000000, reward total was -19.000000. running mean: -18.685170
episode 2001.000000, reward total was -21.000000. running mean: -18.708318
episode 2002.000000, reward total was -21.000000. running mean: -18.731235
episode 2003.000000, reward total was -17.000000. running mean: -18.713923
episode 2004.000000, reward total was -19.000000. running mean: -18.716783
episode 2005.000000, rewa

episode 2102.000000, reward total was -18.000000. running mean: -18.598531
episode 2103.000000, reward total was -21.000000. running mean: -18.622546
episode 2104.000000, reward total was -19.000000. running mean: -18.626320
episode 2105.000000, reward total was -20.000000. running mean: -18.640057
episode 2106.000000, reward total was -19.000000. running mean: -18.643657
episode 2107.000000, reward total was -19.000000. running mean: -18.647220
episode 2108.000000, reward total was -15.000000. running mean: -18.610748
episode 2109.000000, reward total was -20.000000. running mean: -18.624640
episode 2110.000000, reward total was -19.000000. running mean: -18.628394
episode 2111.000000, reward total was -19.000000. running mean: -18.632110
episode 2112.000000, reward total was -18.000000. running mean: -18.625789
episode 2113.000000, reward total was -17.000000. running mean: -18.609531
episode 2114.000000, reward total was -19.000000. running mean: -18.613436
episode 2115.000000, rewa

episode 2212.000000, reward total was -13.000000. running mean: -18.532004
episode 2213.000000, reward total was -20.000000. running mean: -18.546684
episode 2214.000000, reward total was -18.000000. running mean: -18.541217
episode 2215.000000, reward total was -16.000000. running mean: -18.515805
episode 2216.000000, reward total was -17.000000. running mean: -18.500647
episode 2217.000000, reward total was -18.000000. running mean: -18.495641
episode 2218.000000, reward total was -18.000000. running mean: -18.490684
episode 2219.000000, reward total was -21.000000. running mean: -18.515778
episode 2220.000000, reward total was -17.000000. running mean: -18.500620
episode 2221.000000, reward total was -19.000000. running mean: -18.505614
episode 2222.000000, reward total was -17.000000. running mean: -18.490557
episode 2223.000000, reward total was -18.000000. running mean: -18.485652
episode 2224.000000, reward total was -18.000000. running mean: -18.480795
episode 2225.000000, rewa

episode 2322.000000, reward total was -20.000000. running mean: -18.303741
episode 2323.000000, reward total was -19.000000. running mean: -18.310704
episode 2324.000000, reward total was -21.000000. running mean: -18.337597
episode 2325.000000, reward total was -18.000000. running mean: -18.334221
episode 2326.000000, reward total was -17.000000. running mean: -18.320878
episode 2327.000000, reward total was -20.000000. running mean: -18.337670
episode 2328.000000, reward total was -17.000000. running mean: -18.324293
episode 2329.000000, reward total was -21.000000. running mean: -18.351050
episode 2330.000000, reward total was -17.000000. running mean: -18.337540
episode 2331.000000, reward total was -21.000000. running mean: -18.364164
episode 2332.000000, reward total was -21.000000. running mean: -18.390523
episode 2333.000000, reward total was -15.000000. running mean: -18.356617
episode 2334.000000, reward total was -21.000000. running mean: -18.383051
episode 2335.000000, rewa

episode 2432.000000, reward total was -16.000000. running mean: -18.195103
episode 2433.000000, reward total was -20.000000. running mean: -18.213152
episode 2434.000000, reward total was -21.000000. running mean: -18.241020
episode 2435.000000, reward total was -18.000000. running mean: -18.238610
episode 2436.000000, reward total was -19.000000. running mean: -18.246224
episode 2437.000000, reward total was -21.000000. running mean: -18.273762
episode 2438.000000, reward total was -20.000000. running mean: -18.291024
episode 2439.000000, reward total was -15.000000. running mean: -18.258114
episode 2440.000000, reward total was -19.000000. running mean: -18.265533
episode 2441.000000, reward total was -15.000000. running mean: -18.232877
episode 2442.000000, reward total was -19.000000. running mean: -18.240549
episode 2443.000000, reward total was -14.000000. running mean: -18.198143
episode 2444.000000, reward total was -16.000000. running mean: -18.176162
episode 2445.000000, rewa

episode 2542.000000, reward total was -18.000000. running mean: -17.813758
episode 2543.000000, reward total was -17.000000. running mean: -17.805621
episode 2544.000000, reward total was -15.000000. running mean: -17.777565
episode 2545.000000, reward total was -13.000000. running mean: -17.729789
episode 2546.000000, reward total was -19.000000. running mean: -17.742491
episode 2547.000000, reward total was -16.000000. running mean: -17.725066
episode 2548.000000, reward total was -21.000000. running mean: -17.757816
episode 2549.000000, reward total was -14.000000. running mean: -17.720237
episode 2550.000000, reward total was -18.000000. running mean: -17.723035
episode 2551.000000, reward total was -19.000000. running mean: -17.735805
episode 2552.000000, reward total was -15.000000. running mean: -17.708447
episode 2553.000000, reward total was -19.000000. running mean: -17.721362
episode 2554.000000, reward total was -18.000000. running mean: -17.724149
episode 2555.000000, rewa

episode 2652.000000, reward total was -17.000000. running mean: -17.687255
episode 2653.000000, reward total was -19.000000. running mean: -17.700382
episode 2654.000000, reward total was -16.000000. running mean: -17.683378
episode 2655.000000, reward total was -19.000000. running mean: -17.696545
episode 2656.000000, reward total was -19.000000. running mean: -17.709579
episode 2657.000000, reward total was -18.000000. running mean: -17.712483
episode 2658.000000, reward total was -16.000000. running mean: -17.695358
episode 2659.000000, reward total was -17.000000. running mean: -17.688405
episode 2660.000000, reward total was -16.000000. running mean: -17.671521
episode 2661.000000, reward total was -17.000000. running mean: -17.664806
episode 2662.000000, reward total was -12.000000. running mean: -17.608158
episode 2663.000000, reward total was -16.000000. running mean: -17.592076
episode 2664.000000, reward total was -21.000000. running mean: -17.626155
episode 2665.000000, rewa

episode 2762.000000, reward total was -19.000000. running mean: -17.426796
episode 2763.000000, reward total was -18.000000. running mean: -17.432528
episode 2764.000000, reward total was -17.000000. running mean: -17.428203
episode 2765.000000, reward total was -16.000000. running mean: -17.413921
episode 2766.000000, reward total was -17.000000. running mean: -17.409782
episode 2767.000000, reward total was -16.000000. running mean: -17.395684
episode 2768.000000, reward total was -11.000000. running mean: -17.331727
episode 2769.000000, reward total was -15.000000. running mean: -17.308410
episode 2770.000000, reward total was -14.000000. running mean: -17.275326
episode 2771.000000, reward total was -18.000000. running mean: -17.282572
episode 2772.000000, reward total was -15.000000. running mean: -17.259747
episode 2773.000000, reward total was -19.000000. running mean: -17.277149
episode 2774.000000, reward total was -15.000000. running mean: -17.254378
episode 2775.000000, rewa

episode 2872.000000, reward total was -19.000000. running mean: -17.383791
episode 2873.000000, reward total was -19.000000. running mean: -17.399953
episode 2874.000000, reward total was -17.000000. running mean: -17.395954
episode 2875.000000, reward total was -17.000000. running mean: -17.391994
episode 2876.000000, reward total was -19.000000. running mean: -17.408074
episode 2877.000000, reward total was -15.000000. running mean: -17.383994
episode 2878.000000, reward total was -14.000000. running mean: -17.350154
episode 2879.000000, reward total was -15.000000. running mean: -17.326652
episode 2880.000000, reward total was -17.000000. running mean: -17.323386
episode 2881.000000, reward total was -15.000000. running mean: -17.300152
episode 2882.000000, reward total was -16.000000. running mean: -17.287150
episode 2883.000000, reward total was -17.000000. running mean: -17.284279
episode 2884.000000, reward total was -16.000000. running mean: -17.271436
episode 2885.000000, rewa

episode 2982.000000, reward total was -17.000000. running mean: -16.901784
episode 2983.000000, reward total was -17.000000. running mean: -16.902766
episode 2984.000000, reward total was -12.000000. running mean: -16.853739
episode 2985.000000, reward total was -17.000000. running mean: -16.855201
episode 2986.000000, reward total was -15.000000. running mean: -16.836649
episode 2987.000000, reward total was -19.000000. running mean: -16.858283
episode 2988.000000, reward total was -18.000000. running mean: -16.869700
episode 2989.000000, reward total was -17.000000. running mean: -16.871003
episode 2990.000000, reward total was -15.000000. running mean: -16.852293
episode 2991.000000, reward total was -17.000000. running mean: -16.853770
episode 2992.000000, reward total was -15.000000. running mean: -16.835232
episode 2993.000000, reward total was -14.000000. running mean: -16.806880
episode 2994.000000, reward total was -18.000000. running mean: -16.818811
episode 2995.000000, rewa

episode 3092.000000, reward total was -19.000000. running mean: -16.854893
episode 3093.000000, reward total was -15.000000. running mean: -16.836344
episode 3094.000000, reward total was -15.000000. running mean: -16.817981
episode 3095.000000, reward total was -18.000000. running mean: -16.829801
episode 3096.000000, reward total was -19.000000. running mean: -16.851503
episode 3097.000000, reward total was -17.000000. running mean: -16.852988
episode 3098.000000, reward total was -16.000000. running mean: -16.844458
episode 3099.000000, reward total was -20.000000. running mean: -16.876013
episode 3100.000000, reward total was -15.000000. running mean: -16.857253
episode 3101.000000, reward total was -19.000000. running mean: -16.878681
episode 3102.000000, reward total was -16.000000. running mean: -16.869894
episode 3103.000000, reward total was -18.000000. running mean: -16.881195
episode 3104.000000, reward total was -19.000000. running mean: -16.902383
episode 3105.000000, rewa

episode 3202.000000, reward total was -18.000000. running mean: -16.663039
episode 3203.000000, reward total was -13.000000. running mean: -16.626409
episode 3204.000000, reward total was -18.000000. running mean: -16.640145
episode 3205.000000, reward total was -19.000000. running mean: -16.663743
episode 3206.000000, reward total was -14.000000. running mean: -16.637106
episode 3207.000000, reward total was -17.000000. running mean: -16.640735
episode 3208.000000, reward total was -21.000000. running mean: -16.684328
episode 3209.000000, reward total was -14.000000. running mean: -16.657484
episode 3210.000000, reward total was -15.000000. running mean: -16.640909
episode 3211.000000, reward total was -16.000000. running mean: -16.634500
episode 3212.000000, reward total was -19.000000. running mean: -16.658155
episode 3213.000000, reward total was -18.000000. running mean: -16.671574
episode 3214.000000, reward total was -16.000000. running mean: -16.664858
episode 3215.000000, rewa

episode 3312.000000, reward total was -20.000000. running mean: -16.564611
episode 3313.000000, reward total was -12.000000. running mean: -16.518965
episode 3314.000000, reward total was -16.000000. running mean: -16.513775
episode 3315.000000, reward total was -18.000000. running mean: -16.528637
episode 3316.000000, reward total was -19.000000. running mean: -16.553351
episode 3317.000000, reward total was -12.000000. running mean: -16.507818
episode 3318.000000, reward total was -17.000000. running mean: -16.512739
episode 3319.000000, reward total was -13.000000. running mean: -16.477612
episode 3320.000000, reward total was -19.000000. running mean: -16.502836
episode 3321.000000, reward total was -16.000000. running mean: -16.497808
episode 3322.000000, reward total was -14.000000. running mean: -16.472829
episode 3323.000000, reward total was -12.000000. running mean: -16.428101
episode 3324.000000, reward total was -16.000000. running mean: -16.423820
episode 3325.000000, rewa

episode 3422.000000, reward total was -13.000000. running mean: -16.148159
episode 3423.000000, reward total was -17.000000. running mean: -16.156677
episode 3424.000000, reward total was -17.000000. running mean: -16.165110
episode 3425.000000, reward total was -19.000000. running mean: -16.193459
episode 3426.000000, reward total was -17.000000. running mean: -16.201525
episode 3427.000000, reward total was -15.000000. running mean: -16.189509
episode 3428.000000, reward total was -17.000000. running mean: -16.197614
episode 3429.000000, reward total was -20.000000. running mean: -16.235638
episode 3430.000000, reward total was -15.000000. running mean: -16.223282
episode 3431.000000, reward total was -18.000000. running mean: -16.241049
episode 3432.000000, reward total was -17.000000. running mean: -16.248638
episode 3433.000000, reward total was -13.000000. running mean: -16.216152
episode 3434.000000, reward total was -20.000000. running mean: -16.253991
episode 3435.000000, rewa

episode 3532.000000, reward total was -14.000000. running mean: -15.841960
episode 3533.000000, reward total was -14.000000. running mean: -15.823541
episode 3534.000000, reward total was -17.000000. running mean: -15.835305
episode 3535.000000, reward total was -19.000000. running mean: -15.866952
episode 3536.000000, reward total was -15.000000. running mean: -15.858283
episode 3537.000000, reward total was -15.000000. running mean: -15.849700
episode 3538.000000, reward total was -16.000000. running mean: -15.851203
episode 3539.000000, reward total was -17.000000. running mean: -15.862691
episode 3540.000000, reward total was -15.000000. running mean: -15.854064
episode 3541.000000, reward total was -17.000000. running mean: -15.865523
episode 3542.000000, reward total was -17.000000. running mean: -15.876868
episode 3543.000000, reward total was -7.000000. running mean: -15.788099
episode 3544.000000, reward total was -16.000000. running mean: -15.790218
episode 3545.000000, rewar

episode 3642.000000, reward total was -18.000000. running mean: -15.638643
episode 3643.000000, reward total was -13.000000. running mean: -15.612257
episode 3644.000000, reward total was -18.000000. running mean: -15.636134
episode 3645.000000, reward total was -17.000000. running mean: -15.649773
episode 3646.000000, reward total was -15.000000. running mean: -15.643275
episode 3647.000000, reward total was -17.000000. running mean: -15.656842
episode 3648.000000, reward total was -18.000000. running mean: -15.680274
episode 3649.000000, reward total was -19.000000. running mean: -15.713471
episode 3650.000000, reward total was -19.000000. running mean: -15.746337
episode 3651.000000, reward total was -13.000000. running mean: -15.718873
episode 3652.000000, reward total was -17.000000. running mean: -15.731684
episode 3653.000000, reward total was -15.000000. running mean: -15.724368
episode 3654.000000, reward total was -15.000000. running mean: -15.717124
episode 3655.000000, rewa

episode 3752.000000, reward total was -19.000000. running mean: -15.651114
episode 3753.000000, reward total was -18.000000. running mean: -15.674603
episode 3754.000000, reward total was -11.000000. running mean: -15.627857
episode 3755.000000, reward total was -15.000000. running mean: -15.621579
episode 3756.000000, reward total was -14.000000. running mean: -15.605363
episode 3757.000000, reward total was -14.000000. running mean: -15.589309
episode 3758.000000, reward total was -14.000000. running mean: -15.573416
episode 3759.000000, reward total was -12.000000. running mean: -15.537682
episode 3760.000000, reward total was -19.000000. running mean: -15.572305
episode 3761.000000, reward total was -15.000000. running mean: -15.566582
episode 3762.000000, reward total was -16.000000. running mean: -15.570916
episode 3763.000000, reward total was -16.000000. running mean: -15.575207
episode 3764.000000, reward total was -15.000000. running mean: -15.569455
episode 3765.000000, rewa

episode 3862.000000, reward total was -14.000000. running mean: -14.708896
episode 3863.000000, reward total was -10.000000. running mean: -14.661807
episode 3864.000000, reward total was -15.000000. running mean: -14.665189
episode 3865.000000, reward total was -12.000000. running mean: -14.638537
episode 3866.000000, reward total was -18.000000. running mean: -14.672152
episode 3867.000000, reward total was -19.000000. running mean: -14.715430
episode 3868.000000, reward total was -15.000000. running mean: -14.718276
episode 3869.000000, reward total was -13.000000. running mean: -14.701093
episode 3870.000000, reward total was -15.000000. running mean: -14.704082
episode 3871.000000, reward total was -16.000000. running mean: -14.717041
episode 3872.000000, reward total was -11.000000. running mean: -14.679871
episode 3873.000000, reward total was -17.000000. running mean: -14.703072
episode 3874.000000, reward total was -15.000000. running mean: -14.706041
episode 3875.000000, rewa

episode 3972.000000, reward total was -17.000000. running mean: -14.932181
episode 3973.000000, reward total was -20.000000. running mean: -14.982859
episode 3974.000000, reward total was -15.000000. running mean: -14.983031
episode 3975.000000, reward total was -15.000000. running mean: -14.983200
episode 3976.000000, reward total was -14.000000. running mean: -14.973368
episode 3977.000000, reward total was -16.000000. running mean: -14.983635
episode 3978.000000, reward total was -14.000000. running mean: -14.973798
episode 3979.000000, reward total was -10.000000. running mean: -14.924060
episode 3980.000000, reward total was -14.000000. running mean: -14.914820
episode 3981.000000, reward total was -16.000000. running mean: -14.925672
episode 3982.000000, reward total was -11.000000. running mean: -14.886415
episode 3983.000000, reward total was -19.000000. running mean: -14.927551
episode 3984.000000, reward total was -17.000000. running mean: -14.948275
episode 3985.000000, rewa

episode 4082.000000, reward total was -14.000000. running mean: -14.970715
episode 4083.000000, reward total was -16.000000. running mean: -14.981008
episode 4084.000000, reward total was -14.000000. running mean: -14.971198
episode 4085.000000, reward total was -13.000000. running mean: -14.951486
episode 4086.000000, reward total was -16.000000. running mean: -14.961971
episode 4087.000000, reward total was -15.000000. running mean: -14.962351
episode 4088.000000, reward total was -19.000000. running mean: -15.002728
episode 4089.000000, reward total was -20.000000. running mean: -15.052700
episode 4090.000000, reward total was -17.000000. running mean: -15.072173
episode 4091.000000, reward total was -17.000000. running mean: -15.091452
episode 4092.000000, reward total was -11.000000. running mean: -15.050537
episode 4093.000000, reward total was -16.000000. running mean: -15.060032
episode 4094.000000, reward total was -15.000000. running mean: -15.059431
episode 4095.000000, rewa

episode 4192.000000, reward total was -11.000000. running mean: -14.464050
episode 4193.000000, reward total was -13.000000. running mean: -14.449409
episode 4194.000000, reward total was -18.000000. running mean: -14.484915
episode 4195.000000, reward total was -4.000000. running mean: -14.380066
episode 4196.000000, reward total was -16.000000. running mean: -14.396265
episode 4197.000000, reward total was -13.000000. running mean: -14.382303
episode 4198.000000, reward total was -9.000000. running mean: -14.328479
episode 4199.000000, reward total was -17.000000. running mean: -14.355195
episode 4200.000000, reward total was -15.000000. running mean: -14.361643
episode 4201.000000, reward total was -14.000000. running mean: -14.358026
episode 4202.000000, reward total was -15.000000. running mean: -14.364446
episode 4203.000000, reward total was -15.000000. running mean: -14.370802
episode 4204.000000, reward total was -7.000000. running mean: -14.297094
episode 4205.000000, reward 

episode 4302.000000, reward total was -9.000000. running mean: -14.487915
episode 4303.000000, reward total was -19.000000. running mean: -14.533036
episode 4304.000000, reward total was -16.000000. running mean: -14.547705
episode 4305.000000, reward total was -14.000000. running mean: -14.542228
episode 4306.000000, reward total was -15.000000. running mean: -14.546806
episode 4307.000000, reward total was -19.000000. running mean: -14.591338
episode 4308.000000, reward total was -20.000000. running mean: -14.645425
episode 4309.000000, reward total was -13.000000. running mean: -14.628970
episode 4310.000000, reward total was -13.000000. running mean: -14.612681
episode 4311.000000, reward total was -15.000000. running mean: -14.616554
episode 4312.000000, reward total was -19.000000. running mean: -14.660388
episode 4313.000000, reward total was -16.000000. running mean: -14.673784
episode 4314.000000, reward total was -13.000000. running mean: -14.657047
episode 4315.000000, rewar

episode 4412.000000, reward total was -10.000000. running mean: -14.196965
episode 4413.000000, reward total was -11.000000. running mean: -14.164995
episode 4414.000000, reward total was -17.000000. running mean: -14.193345
episode 4415.000000, reward total was -17.000000. running mean: -14.221412
episode 4416.000000, reward total was -17.000000. running mean: -14.249197
episode 4417.000000, reward total was -11.000000. running mean: -14.216705
episode 4418.000000, reward total was -15.000000. running mean: -14.224538
episode 4419.000000, reward total was -12.000000. running mean: -14.202293
episode 4420.000000, reward total was -14.000000. running mean: -14.200270
episode 4421.000000, reward total was -13.000000. running mean: -14.188267
episode 4422.000000, reward total was -12.000000. running mean: -14.166385
episode 4423.000000, reward total was -6.000000. running mean: -14.084721
episode 4424.000000, reward total was -12.000000. running mean: -14.063874
episode 4425.000000, rewar

episode 4522.000000, reward total was -17.000000. running mean: -13.475184
episode 4523.000000, reward total was -11.000000. running mean: -13.450432
episode 4524.000000, reward total was -15.000000. running mean: -13.465928
episode 4525.000000, reward total was -9.000000. running mean: -13.421269
episode 4526.000000, reward total was -17.000000. running mean: -13.457056
episode 4527.000000, reward total was -13.000000. running mean: -13.452485
episode 4528.000000, reward total was -17.000000. running mean: -13.487960
episode 4529.000000, reward total was -18.000000. running mean: -13.533081
episode 4530.000000, reward total was -13.000000. running mean: -13.527750
episode 4531.000000, reward total was -16.000000. running mean: -13.552472
episode 4532.000000, reward total was -10.000000. running mean: -13.516948
episode 4533.000000, reward total was -15.000000. running mean: -13.531778
episode 4534.000000, reward total was -15.000000. running mean: -13.546461
episode 4535.000000, rewar

episode 4632.000000, reward total was -12.000000. running mean: -13.221700
episode 4633.000000, reward total was -14.000000. running mean: -13.229483
episode 4634.000000, reward total was -11.000000. running mean: -13.207188
episode 4635.000000, reward total was -11.000000. running mean: -13.185116
episode 4636.000000, reward total was -14.000000. running mean: -13.193265
episode 4637.000000, reward total was -13.000000. running mean: -13.191332
episode 4638.000000, reward total was -17.000000. running mean: -13.229419
episode 4639.000000, reward total was -17.000000. running mean: -13.267125
episode 4640.000000, reward total was -12.000000. running mean: -13.254453
episode 4641.000000, reward total was 5.000000. running mean: -13.071909
episode 4642.000000, reward total was -16.000000. running mean: -13.101190
episode 4643.000000, reward total was -11.000000. running mean: -13.080178
episode 4644.000000, reward total was -16.000000. running mean: -13.109376
episode 4645.000000, reward

episode 4742.000000, reward total was -17.000000. running mean: -13.089346
episode 4743.000000, reward total was -12.000000. running mean: -13.078453
episode 4744.000000, reward total was -4.000000. running mean: -12.987668
episode 4745.000000, reward total was -18.000000. running mean: -13.037792
episode 4746.000000, reward total was -14.000000. running mean: -13.047414
episode 4747.000000, reward total was -9.000000. running mean: -13.006940
episode 4748.000000, reward total was -12.000000. running mean: -12.996870
episode 4749.000000, reward total was -15.000000. running mean: -13.016901
episode 4750.000000, reward total was -9.000000. running mean: -12.976732
episode 4751.000000, reward total was -15.000000. running mean: -12.996965
episode 4752.000000, reward total was -15.000000. running mean: -13.016995
episode 4753.000000, reward total was -11.000000. running mean: -12.996825
episode 4754.000000, reward total was -5.000000. running mean: -12.916857
episode 4755.000000, reward t

episode 4852.000000, reward total was -12.000000. running mean: -13.009361
episode 4853.000000, reward total was -17.000000. running mean: -13.049267
episode 4854.000000, reward total was -14.000000. running mean: -13.058775
episode 4855.000000, reward total was -17.000000. running mean: -13.098187
episode 4856.000000, reward total was -17.000000. running mean: -13.137205
episode 4857.000000, reward total was -15.000000. running mean: -13.155833
episode 4858.000000, reward total was -9.000000. running mean: -13.114275
episode 4859.000000, reward total was -16.000000. running mean: -13.143132
episode 4860.000000, reward total was -15.000000. running mean: -13.161700
episode 4861.000000, reward total was -15.000000. running mean: -13.180083
episode 4862.000000, reward total was -15.000000. running mean: -13.198283
episode 4863.000000, reward total was -11.000000. running mean: -13.176300
episode 4864.000000, reward total was -12.000000. running mean: -13.164537
episode 4865.000000, rewar

episode 4962.000000, reward total was -11.000000. running mean: -12.824525
episode 4963.000000, reward total was -5.000000. running mean: -12.746280
episode 4964.000000, reward total was -11.000000. running mean: -12.728817
episode 4965.000000, reward total was -11.000000. running mean: -12.711529
episode 4966.000000, reward total was -12.000000. running mean: -12.704413
episode 4967.000000, reward total was -3.000000. running mean: -12.607369
episode 4968.000000, reward total was -3.000000. running mean: -12.511295
episode 4969.000000, reward total was -14.000000. running mean: -12.526182
episode 4970.000000, reward total was -15.000000. running mean: -12.550921
episode 4971.000000, reward total was -14.000000. running mean: -12.565411
episode 4972.000000, reward total was -10.000000. running mean: -12.539757
episode 4973.000000, reward total was -9.000000. running mean: -12.504360
episode 4974.000000, reward total was -11.000000. running mean: -12.489316
episode 4975.000000, reward t

episode 5072.000000, reward total was -16.000000. running mean: -12.100770
episode 5073.000000, reward total was -13.000000. running mean: -12.109762
episode 5074.000000, reward total was -10.000000. running mean: -12.088664
episode 5075.000000, reward total was -9.000000. running mean: -12.057778
episode 5076.000000, reward total was -1.000000. running mean: -11.947200
episode 5077.000000, reward total was -11.000000. running mean: -11.937728
episode 5078.000000, reward total was -14.000000. running mean: -11.958351
episode 5079.000000, reward total was -15.000000. running mean: -11.988767
episode 5080.000000, reward total was -6.000000. running mean: -11.928879
episode 5081.000000, reward total was -5.000000. running mean: -11.859591
episode 5082.000000, reward total was -11.000000. running mean: -11.850995
episode 5083.000000, reward total was -15.000000. running mean: -11.882485
episode 5084.000000, reward total was -10.000000. running mean: -11.863660
episode 5085.000000, reward t

episode 5182.000000, reward total was -12.000000. running mean: -11.304402
episode 5183.000000, reward total was -16.000000. running mean: -11.351358
episode 5184.000000, reward total was -17.000000. running mean: -11.407845
episode 5185.000000, reward total was -14.000000. running mean: -11.433766
episode 5186.000000, reward total was -13.000000. running mean: -11.449429
episode 5187.000000, reward total was -15.000000. running mean: -11.484934
episode 5188.000000, reward total was -7.000000. running mean: -11.440085
episode 5189.000000, reward total was -7.000000. running mean: -11.395684
episode 5190.000000, reward total was -15.000000. running mean: -11.431727
episode 5191.000000, reward total was -12.000000. running mean: -11.437410
episode 5192.000000, reward total was -10.000000. running mean: -11.423036
episode 5193.000000, reward total was -16.000000. running mean: -11.468806
episode 5194.000000, reward total was -20.000000. running mean: -11.554117
episode 5195.000000, reward

episode 5292.000000, reward total was -14.000000. running mean: -11.153294
episode 5293.000000, reward total was -11.000000. running mean: -11.151761
episode 5294.000000, reward total was -17.000000. running mean: -11.210243
episode 5295.000000, reward total was -11.000000. running mean: -11.208141
episode 5296.000000, reward total was -17.000000. running mean: -11.266059
episode 5297.000000, reward total was -9.000000. running mean: -11.243399
episode 5298.000000, reward total was -9.000000. running mean: -11.220965
episode 5299.000000, reward total was -12.000000. running mean: -11.228755
episode 5300.000000, reward total was -4.000000. running mean: -11.156467
episode 5301.000000, reward total was -12.000000. running mean: -11.164903
episode 5302.000000, reward total was -10.000000. running mean: -11.153254
episode 5303.000000, reward total was -7.000000. running mean: -11.111721
episode 5304.000000, reward total was -5.000000. running mean: -11.050604
episode 5305.000000, reward to

episode 5402.000000, reward total was -6.000000. running mean: -11.183446
episode 5403.000000, reward total was -11.000000. running mean: -11.181612
episode 5404.000000, reward total was -15.000000. running mean: -11.219796
episode 5405.000000, reward total was -8.000000. running mean: -11.187598
episode 5406.000000, reward total was -13.000000. running mean: -11.205722
episode 5407.000000, reward total was -6.000000. running mean: -11.153665
episode 5408.000000, reward total was -7.000000. running mean: -11.112128
episode 5409.000000, reward total was -7.000000. running mean: -11.071007
episode 5410.000000, reward total was -11.000000. running mean: -11.070297
episode 5411.000000, reward total was -13.000000. running mean: -11.089594
episode 5412.000000, reward total was -11.000000. running mean: -11.088698
episode 5413.000000, reward total was -10.000000. running mean: -11.077811
episode 5414.000000, reward total was -15.000000. running mean: -11.117033
episode 5415.000000, reward to

episode 5512.000000, reward total was -12.000000. running mean: -11.111171
episode 5513.000000, reward total was -14.000000. running mean: -11.140059
episode 5514.000000, reward total was -12.000000. running mean: -11.148658
episode 5515.000000, reward total was -10.000000. running mean: -11.137172
episode 5516.000000, reward total was -16.000000. running mean: -11.185800
episode 5517.000000, reward total was -12.000000. running mean: -11.193942
episode 5518.000000, reward total was -6.000000. running mean: -11.142003
episode 5519.000000, reward total was -11.000000. running mean: -11.140583
episode 5520.000000, reward total was -11.000000. running mean: -11.139177
episode 5521.000000, reward total was -8.000000. running mean: -11.107785
episode 5522.000000, reward total was -7.000000. running mean: -11.066707
episode 5523.000000, reward total was -14.000000. running mean: -11.096040
episode 5524.000000, reward total was -11.000000. running mean: -11.095080
episode 5525.000000, reward 

episode 5622.000000, reward total was -11.000000. running mean: -10.904229
episode 5623.000000, reward total was -3.000000. running mean: -10.825187
episode 5624.000000, reward total was -7.000000. running mean: -10.786935
episode 5625.000000, reward total was -10.000000. running mean: -10.779066
episode 5626.000000, reward total was -16.000000. running mean: -10.831275
episode 5627.000000, reward total was -15.000000. running mean: -10.872962
episode 5628.000000, reward total was 4.000000. running mean: -10.724232
episode 5629.000000, reward total was -11.000000. running mean: -10.726990
episode 5630.000000, reward total was -14.000000. running mean: -10.759720
episode 5631.000000, reward total was -11.000000. running mean: -10.762123
episode 5632.000000, reward total was -11.000000. running mean: -10.764502
episode 5633.000000, reward total was -9.000000. running mean: -10.746857
episode 5634.000000, reward total was -14.000000. running mean: -10.779388
episode 5635.000000, reward to

episode 5732.000000, reward total was -17.000000. running mean: -11.078950
episode 5733.000000, reward total was -8.000000. running mean: -11.048161
episode 5734.000000, reward total was -15.000000. running mean: -11.087679
episode 5735.000000, reward total was -19.000000. running mean: -11.166802
episode 5736.000000, reward total was -7.000000. running mean: -11.125134
episode 5737.000000, reward total was -8.000000. running mean: -11.093883
episode 5738.000000, reward total was -8.000000. running mean: -11.062944
episode 5739.000000, reward total was -13.000000. running mean: -11.082315
episode 5740.000000, reward total was -4.000000. running mean: -11.011491
episode 5741.000000, reward total was -13.000000. running mean: -11.031377
episode 5742.000000, reward total was -3.000000. running mean: -10.951063
episode 5743.000000, reward total was -13.000000. running mean: -10.971552
episode 5744.000000, reward total was -7.000000. running mean: -10.931837
episode 5745.000000, reward tota

episode 5842.000000, reward total was -6.000000. running mean: -10.376525
episode 5843.000000, reward total was -14.000000. running mean: -10.412759
episode 5844.000000, reward total was -7.000000. running mean: -10.378632
episode 5845.000000, reward total was -11.000000. running mean: -10.384845
episode 5846.000000, reward total was -10.000000. running mean: -10.380997
episode 5847.000000, reward total was -4.000000. running mean: -10.317187
episode 5848.000000, reward total was -11.000000. running mean: -10.324015
episode 5849.000000, reward total was -9.000000. running mean: -10.310775
episode 5850.000000, reward total was -15.000000. running mean: -10.357667
episode 5851.000000, reward total was -19.000000. running mean: -10.444090
episode 5852.000000, reward total was -7.000000. running mean: -10.409650
episode 5853.000000, reward total was -12.000000. running mean: -10.425553
episode 5854.000000, reward total was -6.000000. running mean: -10.381298
episode 5855.000000, reward tot

episode 5953.000000, reward total was -14.000000. running mean: -9.966381
episode 5954.000000, reward total was -8.000000. running mean: -9.946717
episode 5955.000000, reward total was -15.000000. running mean: -9.997250
episode 5956.000000, reward total was -5.000000. running mean: -9.947277
episode 5957.000000, reward total was -10.000000. running mean: -9.947804
episode 5958.000000, reward total was -11.000000. running mean: -9.958326
episode 5959.000000, reward total was -7.000000. running mean: -9.928743
episode 5960.000000, reward total was -15.000000. running mean: -9.979456
episode 5961.000000, reward total was -1.000000. running mean: -9.889661
episode 5962.000000, reward total was -10.000000. running mean: -9.890764
episode 5963.000000, reward total was -11.000000. running mean: -9.901857
episode 5964.000000, reward total was -2.000000. running mean: -9.822838
episode 5965.000000, reward total was -10.000000. running mean: -9.824610
episode 5966.000000, reward total was -11.0

In [7]:
# Call play_game with the Gym environment `env` and the weight dict `model`
# (keys 'W1'/'W2') that were created in the setup cell.
# NOTE(review): play_game is defined in an earlier cell that is not visible here;
# judging by the recorded output it plays an episode and reports the accumulated
# reward -- confirm against its definition.
play_game(env, model)

See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(
  logger.warn(


Episode finished without success, accumulated reward = 3.0
