# Some Basic RL Agents

This Notebook will create some generic agents to work with Gym's environments.



In [1]:
# As usual, a bit of setup
from __future__ import print_function
import time
import numpy as np
import matplotlib.pyplot as plt
import platform
import gym

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

print("Python version: ", platform.python_version())

Python version:  3.6.4


## Random Agent

The agent simply take the action space passed to it as a parameter and randomly sample from it whenever an action is requested.

In [2]:
import gym
from gym import wrappers


class RandomAgent(object):
    """The world's simplest agent! - Randomly sample from the action space """
    def __init__(self, action_space):
        self.action_space = action_space    # Action space is passed to the agent as a parameter

    def act(self, observation, reward, done):
        return self.action_space.sample()   # Randomly sample from action space

In [3]:
import gym

env = gym.make('CartPole-v0')

# You provide the directory to write to (can be an existing
# directory, including one with existing data -- all monitor files
# will be namespaced). You can also dump to a tempdir if you'd
# like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env = wrappers.Monitor(env, directory=outdir, force=True)  # Delete previous content in directory 

env.seed(0)
agent = RandomAgent(env.action_space)   # Select RandomAgent as agent

episode_count = 100
reward = 0
running_reward = 0   # Reward Stat - a running average of the episode reward total
done = False

for i in range(episode_count):
    ob = env.reset()
    reward_sum = 0  
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            # Calculate running average
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print ('Episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
            break
        # Note there's no env.render() here. But the environment still can open window and
        # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
        # Video is not recorded every episode, see capped_cubic_video_schedule for details.

# Close the env and write monitor result info to disk
env.close()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode reward total was 14.000000. running mean: 0.140000
Episode reward total was 12.000000. running mean: 0.258600
Episode reward total was 12.000000. running mean: 0.376014
Episode reward total was 20.000000. running mean: 0.572254
Episode reward total was 25.000000. running mean: 0.816531
Episode reward total was 10.000000. running mean: 0.908366
Episode reward total was 18.000000. running mean: 1.079282
Episode reward total was 12.000000. running mean: 1.188490
Episode reward total was 11.000000. running mean: 1.286605
Episode reward total was 15.000000. running mean: 1.423739
Episode reward total was 56.000000. running mean: 1.969501
Episode reward total was 18.000000. running mean: 2.129806
Episode reward total was 11.000000. running mean: 2.218508
Episode reward total was 23.000000. running mean: 2.426323
Episode reward total was 70.000000. running mean: 3.102060
Episode

In [4]:
env.close()

## CEM Agent

OpenAI's implementation of the cross-entropy method for maximizing a black-box function. 

In a nutshell the CE method consists of two phases:

* Generate a random data sample (trajectories, vectors, etc.) according to a specified mechanism.
* Update the parameters of the random mechanism based on the data to produce a "better" sample in the next iteration. This step involves minimizing the cross-entropy or Kullback–Leibler divergence.

[Link to Wikipedia](https://en.wikipedia.org/wiki/Cross-entropy_method)

One should always try a BB gun before reaching for the Bazooka. In the case of Reinforcement Learning for example, one strong baseline that should always be tried first is the cross-entropy method (CEM), a simple stochastic hill-climbing “guess and check” approach inspired loosely by evolution.

And for Cartpole, CEM solves the problem (Achieving Episode mean reward: 200.000) in just 4 iterations.

In [3]:
from __future__ import print_function

import gym
from gym import wrappers
import logging
import numpy as np
try:
    import cPickle as pickle
except ImportError:
    import pickle
import json, sys, os
from os import path
from examples.agents._policies import BinaryActionLinearPolicy # Support code for cem.py
import argparse

def cem(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
    """
    Generic implementation of the cross-entropy method for maximizing a black-box function

    f: a function mapping from vector -> scalar
    th_mean: initial mean over input distribution
    batch_size: number of samples of theta to evaluate per batch
    n_iter: number of batches
    elite_frac: for each batch, select this fraction of the top-performing samples
    initial_std: initial standard deviation over parameter vectors
    """
    n_elite = int(np.round(batch_size*elite_frac))
    th_std = np.ones_like(th_mean) * initial_std

    for _ in range(n_iter):
        ths = np.array([th_mean + dth for dth in  th_std[None,:]*np.random.randn(batch_size, th_mean.size)])
        ys = np.array([f(th) for th in ths])
        elite_inds = ys.argsort()[::-1][:n_elite]
        elite_ths = ths[elite_inds]
        th_mean = elite_ths.mean(axis=0)
        th_std = elite_ths.std(axis=0)
        yield {'ys' : ys, 'theta_mean' : th_mean, 'y_mean' : ys.mean()}

def do_rollout(agent, env, num_steps, render=False):
    total_rew = 0
    ob = env.reset()
    for t in range(num_steps):
        a = agent.act(ob)
        (ob, reward, done, _info) = env.step(a)
        total_rew += reward
        if render and t%3==0: env.render()
        if done: break
    return total_rew, t+1

In [5]:
import gym

env = gym.make('CartPole-v0')
env.seed(0)
np.random.seed(0)

params = dict(n_iter=10, batch_size=25, elite_frac = 0.2)
num_steps = 200
display = True

outdir = '/tmp/cem-agent-results'
env = wrappers.Monitor(env, directory=outdir, force=True)

# Prepare snapshotting
# ----------------------------------------
def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh: fh.write(s)

info = {}
info['params'] = params
info['argv'] = sys.argv
info['env_id'] = env.spec.id
# ------------------------------------------

def noisy_evaluation(theta):
    agent = BinaryActionLinearPolicy(theta)
    rew, T = do_rollout(agent, env, num_steps)
    return rew

# Train the agent, and snapshot each stage
for (i, iterdata) in enumerate(
    cem(noisy_evaluation, np.zeros(env.observation_space.shape[0]+1), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f'%(i, iterdata['y_mean']))
    agent = BinaryActionLinearPolicy(iterdata['theta_mean'])
    if display: do_rollout(agent, env, 200, render=True)
    writefile('agent-%.4i.pkl'%i, str(pickle.dumps(agent, -1)))

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))

env.close()



[2017-08-31 16:06:38,241] Making new env: CartPole-v0
[2017-08-31 16:06:38,244] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/cem-agent-results')
[2017-08-31 16:06:38,250] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-08-31 16:06:38,252] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000000.mp4
[2017-08-31 16:06:38,521] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000001.mp4
[2017-08-31 16:06:38,871] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000008.mp4


Iteration  0. Episode mean reward:  25.640


[2017-08-31 16:06:39,790] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000027.mp4


Iteration  1. Episode mean reward:  88.160


[2017-08-31 16:06:41,724] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000064.mp4


Iteration  2. Episode mean reward: 174.560
Iteration  3. Episode mean reward: 193.400


[2017-08-31 16:06:45,885] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000125.mp4


Iteration  4. Episode mean reward: 200.000
Iteration  5. Episode mean reward: 200.000
Iteration  6. Episode mean reward: 200.000
Iteration  7. Episode mean reward: 200.000


[2017-08-31 16:06:53,781] Starting new video recorder writing to /tmp/cem-agent-results/openaigym.video.1.3995.video000216.mp4


Iteration  8. Episode mean reward: 200.000
Iteration  9. Episode mean reward: 200.000


[2017-08-31 16:06:59,358] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/cem-agent-results')


## Policy Gradient Agent

We will now bring out the Bazooka. Policy gradients (PG) is currently the default choice for attacking RL problems:
* It works better than Q Learning when tuned well. 
* It is end-to-end: there’s an explicit policy and a principled approach that directly optimizes the expected reward

I will adapt the Bazooka to the Cartpole problem.

Karpathy's Bazooka is complex because he applies policy gradient to a more complex problem (Pong). We need to rebuild a simpler and more basic solution for Classic Control problems such as Cartpole.

The Pong environment is different from that of CartPole:
* The PG agent accept the difference between 2 successive frames of the Atari Pong screen as input
* This difference frame is feed into a 2-layer NN to generate a single value - logprob for action=2 "RIGHT" 
* The reward of +1 or -1 is given at the end of a game between the agent and the Atari AI
* An episode consists of 21 games; so after 21 games --> done=1
* A batch of 10 episodes is played before there is an update to the NN

In [23]:
from __future__ import print_function

import numpy as np
try:
    import cPickle as pickle
except ImportError:
    import pickle
import gym

# hyperparameters
H = 10 # number of hidden layer neurons - Is 100 an overkill?
batch_size = 10 # How many episodes before we do a param update?
learning_rate = 1e-5
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
render = True

# Neural Network initialization
D = 4 # input dimensionality: 4 (cartpole-v0 observation)
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H)

print("The dimensions of the 2 layer NN:")
print(model['W1'].shape)
print(model['W2'].shape)

def sigmoid(x): 
  return 1.0 / (1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1]

def policy_forward(x):
  h = np.dot(model['W1'], x)
  h[h<0] = 0 # ReLU nonlinearity
  logp = np.dot(model['W2'], h)
  p = sigmoid(logp)
  return p, h # return probability of taking action 1, and hidden state

env = gym.make('CartPole-v0')
observation = env.reset()

xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0

while True:
  if render: env.render()
  time.sleep(0.5)  # Slow the interaction down

  x = observation
    
  # forward the policy network and sample an action from the returned probability
  aprob, h = policy_forward(x)
  action = 1 if np.random.uniform() < aprob else 0 # roll the dice!
    
  # step the environment and get new measurements
  observation, reward, done, info = env.step(action)
    
  print("observation",x)
  print("aprob:",aprob)
  print("action",action)

  reward_sum += reward

  if done: # an episode finished
    episode_number += 1
    
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))  

    reward_sum = 0  # Reset reward sum
    observation = env.reset() # reset env
    
"""
# Update to python3.5 format iteritems --> items
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory


def discount_rewards(r):
  #take 1D float array of rewards and compute discounted reward
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(range(0, r.size)):  # Update to python3.5  xrange--> range
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r


def policy_backward(eph, epdlogp):
  # backward pass. (eph is array of intermediate hidden states)
  dW2 = np.dot(eph.T, epdlogp).ravel()
  dh = np.outer(epdlogp, model['W2'])
  dh[eph <= 0] = 0 # backpro prelu
  dW1 = np.dot(dh.T, epx)
  return {'W1':dW1, 'W2':dW2}
"""

[2017-09-01 18:03:49,278] Making new env: CartPole-v0


The dimensions of the 2 layer NN:
(10, 4)
(10,)
observation [ 0.04961266 -0.0103578  -0.02179311  0.04812695]
aprob: 0.503316204225
action 0
observation [ 0.04940551 -0.20516059 -0.02083057  0.33385507]
aprob: 0.560592399845
action 0
observation [ 0.04530229 -0.39997997 -0.01415347  0.61989699]
aprob: 0.620750803654
action 0
observation [ 0.03730269 -0.59490141 -0.00175553  0.90808889]
aprob: 0.678782029508
action 1
observation [ 0.02540467 -0.39975574  0.01640625  0.61485471]
aprob: 0.626149139203
action 0
observation [ 0.01740955 -0.59510304  0.02870334  0.91265945]
aprob: 0.685034990519
action 1
observation [ 0.00550749 -0.40038093  0.04695653  0.62913421]
aprob: 0.634266831585
action 1
observation [-0.00250013 -0.20594463  0.05953921  0.35160139]
aprob: 0.580026386327
action 1
observation [-0.00661902 -0.01171769  0.06657124  0.0782714 ]
aprob: 0.523411871633
action 1
observation [-0.00685337  0.1823899   0.06813667 -0.19268755]
aprob: 0.518323772018
action 0
observation [-0.003205

observation [ 0.13417533  0.35958553  0.04488739 -0.22352456]
aprob: 0.533043250472
action 1
observation [ 0.14136704  0.55403813  0.0404169  -0.50171721]
aprob: 0.554307327551
action 1
observation [ 0.1524478   0.74856776  0.03038255 -0.78139402]
aprob: 0.57542224666
action 0
observation [ 0.16741916  0.55304164  0.01475467 -0.47930911]
aprob: 0.553898171768
action 0
observation [ 0.17847999  0.35771454  0.00516849 -0.18201263]
aprob: 0.532223182498
action 1
observation [ 0.18563428  0.55276215  0.00152824 -0.47306061]
aprob: 0.553705163627
action 0
observation [ 0.19668953  0.35761865 -0.00793297 -0.1798964 ]
aprob: 0.532098615126
action 0
observation [ 0.2038419   0.16261111 -0.0115309   0.11027343]
aprob: 0.514964708665
action 0
observation [ 0.20709412 -0.03234372 -0.00932543  0.39929624]
aprob: 0.552268698712
action 0
observation [ 0.20644725 -0.22733215 -0.00133951  0.68902452]
aprob: 0.60602249107
action 0
observation [ 0.2019006  -0.42243549  0.01244098  0.98128545]
aprob: 0.6

observation [-0.03573321 -0.02169864 -0.00082817  0.02631425]
aprob: 0.509414735672
action 1
observation [-0.03616718  0.17343518 -0.00030188 -0.26662986]
aprob: 0.519955165755
action 0
observation [-0.03269848 -0.02168246 -0.00563448  0.02595784]
aprob: 0.508330536455
action 0
observation [-0.03313213 -0.21672316 -0.00511532  0.31685771]
aprob: 0.569706353667
action 1
observation [-0.03746659 -0.02152872  0.00122183  0.02256599]
aprob: 0.509312126967
action 1
observation [-0.03789716  0.17357569  0.00167315 -0.26973118]
aprob: 0.520006902478
action 0
observation [-0.03442565 -0.0215701  -0.00372147  0.023479  ]
aprob: 0.508403353983
action 1
observation [-0.03485705  0.17360502 -0.00325189 -0.27037577]
aprob: 0.520043872248
action 0
observation [-0.03138495 -0.02147037 -0.00865941  0.02127972]
aprob: 0.507011720714
action 1
observation [-0.03181436  0.17377469 -0.00823381 -0.27412272]
aprob: 0.5201357931
action 0
observation [-0.02833887 -0.02122882 -0.01371627  0.01595192]
aprob: 0.5

observation [-0.06378691 -0.17948754  0.0969495   0.03635448]
aprob: 0.542871895257
action 1
observation [-0.06737667  0.0141201   0.09767659 -0.2242347 ]
aprob: 0.509232054715
action 0
observation [-0.06709426 -0.18225235  0.0931919   0.09759121]
aprob: 0.551965184629
action 0
observation [-0.07073931 -0.37857782  0.09514372  0.41816035]
aprob: 0.616797837785
action 1
observation [-0.07831087 -0.18492382  0.10350693  0.1569232 ]
aprob: 0.563566453569
action 1
observation [-0.08200934  0.0085756   0.10664539 -0.10139578]
aprob: 0.517248274343
action 1
observation [-0.08183783  0.20202036  0.10461748 -0.35862007]
aprob: 0.522945192666
action 1
observation [-0.07779742  0.39551158  0.09744508 -0.61656859]
aprob: 0.544008617036
action 1
observation [-0.06988719  0.58914683  0.08511371 -0.87704001]
aprob: 0.564977929682
action 0
observation [-0.05810426  0.39297764  0.06757291 -0.55885819]
aprob: 0.543149949419
action 0
observation [-0.0502447   0.1969755   0.05639574 -0.24567463]
aprob: 0

observation [ 0.10350458  0.18703353 -0.07863666 -0.27246537]
aprob: 0.520271802907
action 0
observation [ 0.10724525 -0.00688342 -0.08408597 -0.00558417]
aprob: 0.50546404268
action 1
observation [ 0.10710758  0.18933749 -0.08419765 -0.32356778]
aprob: 0.521189411594
action 0
observation [ 0.11089433 -0.00449093 -0.09066901 -0.05857999]
aprob: 0.505023930795
action 0
observation [ 0.11080451 -0.19820377 -0.09184061  0.20417554]
aprob: 0.524181375474
action 0
observation [ 0.10684043 -0.3919005  -0.0877571   0.46653298]
aprob: 0.581720244673
action 1
observation [ 0.09900242 -0.19565535 -0.07842644  0.14753055]
aprob: 0.518680654601
action 1
observation [ 0.09508932  0.00049698 -0.07547583 -0.1688273 ]
aprob: 0.504471071623
action 0
observation [ 0.09509926 -0.19346804 -0.07885237  0.09912309]
aprob: 0.516614971075
action 1
observation [ 0.0912299   0.00269027 -0.07686991 -0.2173587 ]
aprob: 0.505313172703
action 0
observation [ 0.0912837  -0.19125344 -0.08121708  0.05012028]
aprob: 0.

observation [ 0.04722601 -0.19006163 -0.05482897  0.15703615]
aprob: 0.528300963359
action 0
observation [ 0.04342477 -0.38435745 -0.05168824  0.43193028]
aprob: 0.587452795753
action 1
observation [ 0.03573762 -0.18854318 -0.04304964  0.12341158]
aprob: 0.526053962471
action 1
observation [ 0.03196676  0.00716824 -0.04058141 -0.18253625]
aprob: 0.504258580578
action 1
observation [ 0.03211013  0.20284666 -0.04423213 -0.48773989]
aprob: 0.525176220594
action 0
observation [ 0.03616706  0.0083758  -0.05398693 -0.20931891]
aprob: 0.504799226212
action 0
observation [ 0.03633457 -0.18593431 -0.05817331  0.06585716]
aprob: 0.516620118368
action 0
observation [ 0.03261589 -0.38017606 -0.05685616  0.3396336 ]
aprob: 0.573999511352
action 1
observation [ 0.02501237 -0.18429314 -0.05006349  0.0295773 ]
aprob: 0.515789467461
action 1
observation [ 0.0213265   0.01150968 -0.04947195 -0.27847145]
aprob: 0.507226815061
action 0
observation [ 0.0215567  -0.18287289 -0.05504138 -0.00179316]
aprob: 0

observation [ 0.03111153 -0.15337267  0.03855905  0.36456207]
aprob: 0.5707442118
action 0
observation [ 0.02804408 -0.34902079  0.0458503   0.66914966]
aprob: 0.633060803931
action 1
observation [ 0.02106366 -0.15456535  0.05923329  0.3912483 ]
aprob: 0.57871758316
action 1
observation [ 0.01797235  0.03966813  0.06705825  0.11781345]
aprob: 0.52879355397
action 1
observation [ 0.01876572  0.23376839  0.06941452 -0.1529822 ]
aprob: 0.52206542599
action 0
observation [ 0.02344109  0.03772472  0.06635488  0.16076662]
aprob: 0.534670456742
action 1
observation [ 0.02419558  0.23183706  0.06957021 -0.1102671 ]
aprob: 0.521233974923
action 0
observation [ 0.02883232  0.03579073  0.06736487  0.20352818]
aprob: 0.541075501927
action 0
observation [ 0.02954814 -0.1602267   0.07143543  0.51667773]
aprob: 0.598248577783
action 1
observation [ 0.0263436   0.0338205   0.08176899  0.24733344]
aprob: 0.549993576589
action 1
observation [ 0.02702001  0.22768522  0.08671566 -0.01847873]
aprob: 0.5267

observation [ 0.01436252  0.19085562 -0.04289053 -0.11124933]
aprob: 0.519320466011
action 0
observation [ 0.01817963 -0.00362633 -0.04511552  0.16759936]
aprob: 0.520716419082
action 0
observation [ 0.0181071  -0.19807443 -0.04176353  0.44571526]
aprob: 0.576420006703
action 1
observation [ 0.01414562 -0.00238728 -0.03284923  0.14016557]
aprob: 0.518041891447
action 1
observation [ 0.01409787  0.19318938 -0.03004592 -0.16269708]
aprob: 0.520033776837
action 0
observation [ 0.01796166 -0.00148985 -0.03329986  0.12035779]
aprob: 0.514350227524
action 0
observation [ 0.01793186 -0.19611927 -0.0308927   0.40235172]
aprob: 0.571496385063
action 0
observation [ 0.01400948 -0.39078975 -0.02284567  0.6851371 ]
aprob: 0.6308493247
action 0
observation [ 0.00619368 -0.5855872  -0.00914293  0.97054098]
aprob: 0.687893367221
action 1
observation [-0.00551806 -0.39034373  0.01026789  0.67500003]
aprob: 0.63577812312
action 1
observation [-0.01332494 -0.19536596  0.02376789  0.38556751]
aprob: 0.57

observation [ 0.00074388  0.40983556  0.00224197 -0.5495292 ]
aprob: 0.544608947231
action 0
observation [ 0.00894059  0.21468219 -0.00874861 -0.25614073]
aprob: 0.523004216151
action 1
observation [ 0.01323424  0.40992795 -0.01387143 -0.55157022]
aprob: 0.54468887528
action 0
observation [ 0.02143279  0.21500354 -0.02490283 -0.2632898 ]
aprob: 0.523174113123
action 1
observation [ 0.02573287  0.41047193 -0.03016863 -0.56372216]
aprob: 0.544946197278
action 1
observation [ 0.0339423   0.60600391 -0.04144307 -0.8657549 ]
aprob: 0.566602294764
action 1
observation [ 0.04606238  0.80166467 -0.05875817 -1.1711748 ]
aprob: 0.588090036863
action 1
observation [ 0.06209568  0.99749936 -0.08218166 -1.48168485]
aprob: 0.60935528289
action 1
observation [ 0.08204566  1.19352214 -0.11181536 -1.79886088]
aprob: 0.630341034775
action 1
observation [ 0.10591611  1.38970337 -0.14779258 -2.1241003 ]
aprob: 0.650986464671
action 0
observation [ 0.13371017  1.19632888 -0.19027458 -1.88049367]
aprob: 0.6

observation [ 0.02907203  0.03496261 -0.01225431 -0.02022591]
aprob: 0.503194985317
action 1
observation [ 0.02977128  0.23025813 -0.01265883 -0.31674988]
aprob: 0.524967266846
action 0
observation [ 0.03437644  0.03531876 -0.01899383 -0.02808584]
aprob: 0.50334998443
action 1
observation [ 0.03508282  0.23070787 -0.01955555 -0.32670049]
aprob: 0.525160460778
action 0
observation [ 0.03969698  0.03586972 -0.02608956 -0.04024811]
aprob: 0.503585230492
action 1
observation [ 0.04041437  0.23135589 -0.02689452 -0.34104703]
aprob: 0.525436462134
action 1
observation [ 0.04504149  0.42684996 -0.03371546 -0.64208814]
aprob: 0.547222790036
action 0
observation [ 0.05357849  0.2322138  -0.04655722 -0.36021028]
aprob: 0.525856477905
action 1
observation [ 0.05822276  0.42796557 -0.05376143 -0.66720264]
aprob: 0.547747532882
action 1
observation [ 0.06678207  0.62379234 -0.06710548 -0.97631665]
aprob: 0.569519472944
action 1
observation [ 0.07925792  0.81974697 -0.08663181 -1.28930124]
aprob: 0.

observation [-0.02108644 -0.20602933  0.0534085   0.37703735]
aprob: 0.5846908329
action 0
observation [-0.02520703 -0.40186754  0.06094925  0.68607044]
aprob: 0.646991594827
action 1
observation [-0.03324438 -0.20764228  0.07467066  0.4131815 ]
aprob: 0.594280314045
action 1
observation [-0.03739722 -0.01365381  0.08293429  0.14494242]
aprob: 0.5388227872
action 0
observation [-0.0376703  -0.2098595   0.08583314  0.46259413]
aprob: 0.603570778961
action 0
observation [-0.04186749 -0.40608297  0.09508502  0.78104973]
aprob: 0.666178255731
action 1
observation [-0.04998915 -0.21238772  0.11070601  0.51973218]
aprob: 0.616578393587
action 0
observation [-0.0542369  -0.40887981  0.12110066  0.84514716]
aprob: 0.679304967486
action 0
observation [-0.0624145  -0.60542753  0.1380036   1.17332618]
aprob: 0.737537525875
action 0
observation [-0.07452305 -0.80204659  0.16147012  1.50589517]
aprob: 0.789837635311
action 1
observation [-0.09056398 -0.60920933  0.19158803  1.26766811]
aprob: 0.756

KeyboardInterrupt: 

In [11]:
env = gym.make('CartPole-v0')
observation = env.reset()

xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0

while True:
  if render: env.render()

  x = observation   # No need to preprop
  print(x)

  # forward the policy network and sample an action from the returned probability
  aprob, h = policy_forward(x)
  print("aprob:",aprob)
  action = 1 if np.random.uniform() < aprob else 0 # roll the dice!
  print("action",action)

  # record various intermediates (needed later for backprop)
  xs.append(x) # observation
  hs.append(h) # hidden state
  y = 1 if action == 1 else 0 # a "fake label"
  dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

  # step the environment and get new measurements
  observation, reward, done, info = env.step(action)
  reward_sum += reward

  drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

  if done: # an episode finished
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    xs,hs,dlogps,drs = [],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    discounted_epr /= np.std(discounted_epr)

    epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.items():   # Update to python3.5 format
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))  # Update to python3.5 format
    if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None
  
#  if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
#    print ('ep %d: game finished, reward: %f' % (episode_number, reward)) 
#    print ('' if reward == -1 else ' !!!!!!!!')

[2017-09-01 10:15:43,450] Making new env: CartPole-v0


[ 0.01715323 -0.0465265  -0.01929824 -0.04573151]
[ 0.  0.  0.  0.]
aprob: 0.5
action 0
[-0.00093053 -0.19483999 -0.00091463  0.28653223]
aprob: 0.49153577418
action 1
[-0.00482733  0.19540477  0.00481601 -0.29898946]
aprob: 0.494732336482
action 1
[-0.00091923  0.19533929 -0.00116377 -0.2975007 ]
aprob: 0.494086187979
action 1
[ 0.00298755  0.19535345 -0.00711379 -0.29785859]
aprob: 0.493408971476
action 1
[ 0.00689462  0.19544347 -0.01307096 -0.30004268]
aprob: 0.492709209596
action 1
[ 0.01080349  0.19559655 -0.01907181 -0.30399749]
aprob: 0.491986080904
action 0
[ 0.01471542 -0.19436493 -0.02515176  0.27469161]
aprob: 0.495648520194
action 0
[ 0.01082812 -0.1939492  -0.01965793  0.26619181]
aprob: 0.495531117033
action 0
[ 0.00694914 -0.19359593 -0.0143341   0.25939049]
aprob: 0.495276787168
action 1
[ 0.00307722  0.19654354 -0.00914629 -0.32659344]
aprob: 0.492967200482
action 1
[ 0.00700809  0.19658897 -0.01567815 -0.32900933]
aprob: 0.492239569864
action 0
[ 0.01093987 -0.193037



[ 0.0160496  -0.19370084 -0.02200461  0.26029898]
aprob: 0.496163522362
action 0
[ 0.01217558 -0.19329231 -0.01679863  0.25256265]
aprob: 0.496011655795
action 0
[ 0.00830973 -0.19296271 -0.01174738  0.24654994]
aprob: 0.495709682736
action 1
[ 0.00445048  0.19689152 -0.00681638 -0.33566903]
aprob: 0.492794687549
action 0
[ 0.00838831 -0.19265595 -0.01352976  0.23991238]
aprob: 0.496484592034
action 0
[ 0.00453519 -0.19237387 -0.00873151  0.23493769]
aprob: 0.49615077361
action 1
[ 0.00068771  0.19716114 -0.00403276 -0.34310502]
aprob: 0.493183337228
action 1
[ 0.00463094  0.19714945 -0.01089486 -0.34404082]
aprob: 0.492513262732
action 0
resetting env. episode reward total was 19.000000. running mean: 15.040000
[ 0.  0.  0.  0.]
aprob: 0.5
action 1
[ 0.00056074  0.19571025 -0.00097565 -0.30590929]
aprob: 0.493826794203
action 0
[ 0.00447494 -0.19447124 -0.00709384  0.27852506]
aprob: 0.492834178879
action 0
[ 0.00058552 -0.19435532 -0.00152334  0.27617004]
aprob: 0.49228261872
action 

action 1
[ 0.00814823  0.19281945 -0.00409458 -0.24440075]
aprob: 0.49361251975
action 1
[ 0.01200462  0.1929132  -0.00898259 -0.24589819]
aprob: 0.49296994497
action 1
[ 0.01586288  0.19310461 -0.01390056 -0.24914813]
aprob: 0.49230369855
action 0
[ 0.01972497 -0.19647698 -0.01888352  0.3267506 ]
aprob: 0.490657951335
action 0
[ 0.01579543 -0.19633437 -0.01234851  0.32160912]
aprob: 0.490254252521
action 0
[ 0.01186875 -0.19622115 -0.00591632  0.31815455]
aprob: 0.489739736395
action 0
[ 0.00794432 -0.19615848  0.00044677  0.31646947]
aprob: 0.489089136296
action 1
[ 0.00402115  0.19391004  0.00677616 -0.2666762 ]
aprob: 0.494288183585
action 0
[ 0.00789936 -0.1962432   0.00144263  0.31853746]
aprob: 0.488808874303
action 1
[ 0.00397449  0.19377867  0.00781338 -0.26387292]
aprob: 0.494409057335
action 1
[ 0.00785006  0.19363693  0.00253592 -0.26116649]
aprob: 0.493712130783
action 0
[ 0.0117228  -0.19637713 -0.00268741  0.32187093]
aprob: 0.489040514767
action 1
[ 0.00779526  0.193641

[-0.0231417  -0.19667506  0.02853346  0.33737362]
aprob: 0.484387751929
action 0
resetting env. episode reward total was 36.000000. running mean: 15.803878
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00065949 -0.19457056  0.00071615  0.28074469]
aprob: nan
action 0
[-0.0045509  -0.19458506  0.00633105  0.28098451]
aprob: nan
action 0
[-0.0084426  -0.19468855  0.01195074  0.28306138]
aprob: nan
action 0
[-0.01233637 -0.19487117  0.01761197  0.28693183]
aprob: nan
action 0
[-0.01623379 -0.19511534  0.0233506   0.29252365]
aprob: nan
action 0
[-0.0201361  -0.19539447  0.02920108  0.29973042]
aprob: nan
action 0
[-0.02404399 -0.19567107  0.03519568  0.30840318]
aprob: nan
action 0
[-0.02795741 -0.19589435  0.04136375  0.31833994]
aprob: nan
action 0
[-0.0318753  -0.19599753  0.04773055  0.32927369]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 15.745840
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -4.57913877e-05  -1.95670289e-01   2.13233219e-04   3.0498873



aprob: nan
action 0
[-0.0039592  -0.1956692   0.00631301  0.30504725]
aprob: nan
action 0
[-0.00787258 -0.19573805  0.01241395  0.30692462]
aprob: nan
action 0
[-0.01178734 -0.19586449  0.01855245  0.31056697]
aprob: nan
action 0
[-0.01570463 -0.19602567  0.02476378  0.31587895]
aprob: nan
action 0
[-0.01962515 -0.19618695  0.03108136  0.32271814]
aprob: nan
action 0
[-0.02354888 -0.19630001  0.03753573  0.33088694]
aprob: nan
action 0
resetting env. episode reward total was 8.000000. running mean: 15.668381
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -4.90906753e-04  -1.94598968e-01   1.89427504e-04   2.81352138e-01]
aprob: nan
action 0
[-0.00438289 -0.19460476  0.00581647  0.28141842]
aprob: nan
action 0
[-0.00827498 -0.19469977  0.01144484  0.28332463]
aprob: nan
action 0
[-0.01216898 -0.19487476  0.01711133  0.28702956]
aprob: nan
action 0
[-0.01606647 -0.1951128   0.02285192  0.29246337]
aprob: nan
action 0
[-0.01996873 -0.19538802  0.02870119  0.2995223 ]
aprob: nan
action 0
[-0.0238

[-0.02660396 -0.19618073  0.04170717  0.33017955]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 14.910684
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  3.19103949e-04  -1.95639113e-01  -1.75387692e-04   3.04274458e-01]
aprob: nan
action 0
[-0.00359368 -0.19563362  0.0059101   0.30421662]
aprob: nan
action 0
[-0.00750635 -0.1956993   0.01199443  0.30598137]
aprob: nan
action 0
[-0.01142034 -0.19582456  0.01811406  0.30951777]
aprob: nan
action 0
[-0.01533683 -0.19598744  0.02430442  0.3147341 ]
aprob: nan
action 0
[-0.01925658 -0.19615441  0.0305991   0.32149247]
aprob: nan
action 0
[-0.02317966 -0.19627845  0.03702895  0.3296008 ]
aprob: nan
action 0
[-0.02710523 -0.19629698  0.04362096  0.33880293]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 14.851577
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00068016 -0.19533676  0.00071858  0.29744352]
aprob: nan
action 0
[-0.00458689 -0.19534503  0.00666745  0.29766389]
aprob:

[-0.01210066 -0.19546837  0.01696967  0.30078295]
aprob: nan
action 0
[-0.01601003 -0.19565716  0.02298533  0.30587974]
aprob: nan
action 0
[-0.01992318 -0.195868    0.02910292  0.31257471]
aprob: nan
action 0
[-0.02384054 -0.19605948  0.03535442  0.32070011]
aprob: nan
action 0
[-0.02776172 -0.19617578  0.04176842  0.33002968]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 14.213958
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  7.95707270e-04  -1.95188932e-01  -1.98228143e-04   2.94159747e-01]
aprob: nan
action 0
[-0.00310807 -0.19518576  0.00568497  0.29409698]
aprob: nan
action 0
[-0.00701179 -0.19526312  0.01156691  0.29587214]
aprob: nan
action 0
[-0.01091705 -0.19541075  0.01748435  0.29943967]
aprob: nan
action 0
[-0.01482526 -0.19560927  0.02347314  0.30471894]
aprob: nan
action 0
[-0.01873745 -0.19582898  0.02956752  0.31158897]
aprob: nan
action 0
[-0.02265403 -0.19602797  0.0357993   0.31988033]
aprob: nan
action 0
[-0.02657459 -0.19614985  0.

aprob: nan
action 0
[-0.02797641 -0.19628045  0.04366996  0.337957  ]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 13.649462
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -4.66776456e-04  -1.95680341e-01   1.59882399e-05   3.05219356e-01]
aprob: nan
action 0
[-0.00438038 -0.19567684  0.00612038  0.30521861]
aprob: nan
action 0
[-0.00829392 -0.19574329  0.01222475  0.30703737]
aprob: nan
action 0
[-0.01220879 -0.19586766  0.01836549  0.31062292]
aprob: nan
action 0
[-0.01612614 -0.19602738  0.02457795  0.31588109]
aprob: nan
action 0
[-0.02004669 -0.19618816  0.03089558  0.32267076]
aprob: nan
action 0
[-0.02397045 -0.19630204  0.03734899  0.33079579]
aprob: nan
action 0
resetting env. episode reward total was 8.000000. running mean: 13.592967
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -5.39512374e-04  -1.95800877e-01   2.10048394e-05   3.08002817e-01]
aprob: nan
action 0
[-0.00445553 -0.1957965   0.00618106  0.30800211]
aprob: nan
action 0
[-0.00837146 -0.

[-0.01950332 -0.195555    0.02842546  0.30389138]
aprob: nan
action 0
[-0.02341442 -0.19580263  0.03450328  0.31218464]
aprob: nan
action 0
[-0.02733047 -0.19599589  0.04074698  0.32174956]
aprob: nan
action 0
[-0.03125039 -0.19606778  0.04718197  0.33231643]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 13.071998
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.00052097 -0.19527546  0.00080888  0.29607782]
aprob: nan
action 0
[-0.00338454 -0.19528553  0.00673044  0.29632806]
aprob: nan
action 0
[-0.00729025 -0.19537324  0.012657    0.29840855]
aprob: nan
action 0
[-0.01119772 -0.19552652  0.01862517  0.3022668 ]
aprob: nan
action 0
[-0.01510825 -0.19572382  0.02467051  0.3078134 ]
aprob: nan
action 0
[-0.01902273 -0.1959327   0.03082677  0.31491625]
aprob: nan
action 0
[-0.02294138 -0.1961079   0.0371251   0.32339206]
aprob: nan
action 0
[-0.02686354 -0.19618907  0.04359294  0.3329959 ]
aprob: nan
action 0
resetting env. episode reward total was 9.00000

aprob: nan
action 0
[-0.02309261 -0.19622971  0.03597374  0.32696846]
aprob: nan
action 0
[-0.0270172  -0.19628268  0.04251311  0.3360869 ]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 12.636670
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00029615 -0.19472771  0.00055199  0.2841112 ]
aprob: nan
action 0
[-0.00419071 -0.1947385   0.00623421  0.28429388]
aprob: nan
action 0
[-0.00808548 -0.19483636  0.01192009  0.28631511]
aprob: nan
action 0
[-0.0119822  -0.19501122  0.01764639  0.29013046]
aprob: nan
action 0
[-0.01588243 -0.19524494  0.023449    0.29566518]
aprob: nan
action 0
[-0.01978733 -0.19551005  0.02936231  0.30280874]
aprob: nan
action 0
[-0.02369753 -0.19576778  0.03541848  0.31140648]
aprob: nan
action 0
[-0.02761288 -0.19596578  0.04164661  0.3212492 ]
aprob: nan
action 0
[-0.0315322  -0.19603542  0.04807159  0.33206126]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 12.610303
[ 0.  0.  0.  0.]
aprob

[ 0.0007697  -0.19485838  0.00032354  0.28693115]
aprob: nan
action 0
[-0.00312747 -0.19486477  0.00606216  0.28703753]
aprob: nan
action 0
[-0.00702477 -0.19495659  0.01180291  0.28898338]
aprob: nan
action 0
[-0.0109239  -0.19512373  0.01758258  0.29272398]
aprob: nan
action 0
[-0.01482637 -0.19534772  0.02343706  0.29818301]
aprob: nan
action 0
[-0.01873333 -0.19560049  0.02940072  0.30524711]
aprob: nan
action 0
[-0.02264534 -0.19584245  0.03550566  0.31375766]
aprob: nan
action 0
[-0.02656219 -0.19602015  0.04178082  0.32350037]
aprob: nan
action 0
[-0.03048259 -0.1960637   0.04825082  0.3341936 ]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 12.239232
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -1.15680921e-04  -1.95527376e-01   6.39933788e-04   3.01730926e-01]
aprob: nan
action 0
[-0.00402623 -0.19553279  0.00667455  0.30192158]
aprob: nan
action 0
[-0.00793688 -0.19561085  0.01271298  0.30393501]
aprob: nan
action 0
[-0.0118491  -0.19574896  0

[-0.03068202 -0.19604964  0.04811398  0.33301501]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 11.967279
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00041814 -0.19575834  0.00039726  0.30701738]
aprob: nan
action 0
[-0.0043333  -0.1957586   0.00653761  0.30712898]
aprob: nan
action 0
[-0.00824848 -0.19582638  0.01268019  0.30905365]
aprob: nan
action 0
[-0.012165   -0.19594875  0.01886126  0.31273514]
aprob: nan
action 0
[-0.01608398 -0.19610196  0.02511596  0.31807438]
aprob: nan
action 0
[-0.02000602 -0.19625019  0.03147745  0.32492386]
aprob: nan
action 0
[-0.02393102 -0.19634362  0.03797593  0.33307943]
aprob: nan
action 0
resetting env. episode reward total was 8.000000. running mean: 11.927606
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -4.35447746e-04  -1.94890208e-01   2.75285590e-04   2.87620919e-01]
aprob: nan
action 0
[-0.00433325 -0.19489566  0.0060277   0.28771129]
aprob: nan
action 0
[-0.00823117 -0.19498611  0.01178193  0.28964121]
aprob

[-0.03175739 -0.19600613  0.04686683  0.32841839]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 11.687433
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  4.46179043e-04  -1.95644092e-01   1.57524830e-04   3.04388384e-01]
aprob: nan
action 0
[-0.0034667  -0.19564256  0.00624529  0.30443061]
aprob: nan
action 0
[-0.00737955 -0.19571166  0.0123339   0.3062933 ]
aprob: nan
action 0
[-0.01129379 -0.19583922  0.01845977  0.30992329]
aprob: nan
action 0
[-0.01521057 -0.19600265  0.02465824  0.31522631]
aprob: nan
action 0
[-0.01913062 -0.19616766  0.03096276  0.32206147]
aprob: nan
action 0
[-0.02305398 -0.19628638  0.03740399  0.33023306]
aprob: nan
action 0
resetting env. episode reward total was 8.000000. running mean: 11.650559
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.00081167 -0.19562778 -0.00076093  0.30401633]
aprob: nan
action 0
[-0.00310088 -0.19561526  0.0053194   0.3037821 ]
aprob: nan
action 0
[-0.00701319 -0.19567488  0.01139504  0.30537405]
aprob

[-0.03172227 -0.19600037  0.04699965  0.32830954]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 11.450414
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  1.46406373e-04  -1.94730132e-01  -5.34387396e-04   2.84163269e-01]
aprob: nan
action 0
[-0.0037482  -0.19472381  0.00514888  0.28399254]
aprob: nan
action 0
[-0.00764267 -0.1948057   0.01082873  0.28566564]
aprob: nan
action 0
[-0.01153879 -0.19496721  0.01654204  0.2891436 ]
aprob: nan
action 0
[-0.01543813 -0.19519183  0.02232491  0.29435792]
aprob: nan
action 0
[-0.01934197 -0.19545404  0.02821207  0.30120561]
aprob: nan
action 0
[-0.02325105 -0.19571746  0.03423618  0.30954146]
aprob: nan
action 0
[-0.0271654  -0.19593261  0.04042701  0.31916787]
aprob: nan
action 0
[-0.03108405 -0.19603424  0.04681037  0.3298232 ]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 11.435910
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00042089 -0.1948638  -0.00072465  0.28704805]
apro

[-0.02380505 -0.19556399  0.03381801  0.30496144]
aprob: nan
action 0
[-0.02771633 -0.19581493  0.03991724  0.31469504]
aprob: nan
action 0
[-0.03163262 -0.19596509  0.04621114  0.32550013]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 11.256509
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00094032 -0.19543616  0.0003626   0.29967093]
aprob: nan
action 0
[-0.00484904 -0.19543888  0.00635602  0.29977919]
aprob: nan
action 0
[-0.00875782 -0.1955166   0.0123516   0.30171539]
aprob: nan
action 0
[-0.01266815 -0.19565745  0.01838591  0.30542768]
aprob: nan
action 0
[-0.0165813  -0.19583971  0.02449447  0.31082567]
aprob: nan
action 0
[-0.02049809 -0.19603048  0.03071098  0.31777479]
aprob: nan
action 0
[-0.0244187  -0.19618377  0.03706647  0.32608801]
aprob: nan
action 0
[-0.02834238 -0.19623828  0.04358824  0.33551561]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 11.233944
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -1.

aprob: nan
action 0
[-0.02260975 -0.19555548  0.03363318  0.30468451]
aprob: nan
action 0
[-0.02652086 -0.19580842  0.03972687  0.31438074]
aprob: nan
action 0
[-0.03043703 -0.19596276  0.04601449  0.32515712]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 11.057411
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -9.36420278e-04  -1.95488140e-01   3.49851710e-05   3.00842578e-01]
aprob: nan
action 0
[-0.00484618 -0.19548627  0.00605184  0.30084979]
aprob: nan
action 0
[-0.00875591 -0.1955587   0.01206883  0.30268488]
aprob: nan
action 0
[-0.01266708 -0.19569396  0.01812253  0.30629742]
aprob: nan
action 0
[-0.01658096 -0.19587058  0.02424848  0.311598  ]
aprob: nan
action 0
[-0.02049837 -0.1960559   0.03048044  0.31845275]
aprob: nan
action 0
[-0.02441949 -0.19620414  0.03684949  0.32667523]
aprob: nan
action 0
[-0.02834357 -0.19625419  0.043383    0.33601633]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 11.036837
[ 0

action 0
resetting env. episode reward total was 8.000000. running mean: 10.851136
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00058365 -0.19468693  0.00062691  0.28323509]
aprob: nan
action 0
[-0.00447739 -0.19469919  0.00629161  0.28344304]
aprob: nan
action 0
[-0.00837137 -0.19479899  0.01196047  0.28548907]
aprob: nan
action 0
[-0.01226735 -0.19497628  0.01767025  0.28932884]
aprob: nan
action 0
[-0.01616687 -0.19521302  0.02345683  0.29488809]
aprob: nan
action 0
[-0.02007114 -0.1954819   0.02935459  0.30205711]
aprob: nan
action 0
[-0.02398077 -0.19574443  0.03539573  0.31068246]
aprob: nan
action 0
[-0.02789566 -0.19594856  0.04160938  0.32055646]
aprob: nan
action 0
[-0.03181463 -0.19602605  0.04802051  0.3314053 ]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 10.842625
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -1.93690690e-04  -1.95129649e-01  -1.73989370e-04   2.92852326e-01]
aprob: nan
action 0
[-0.00409628 -0.19512713  0.00568306  0.292797

[-0.01626776 -0.19595408  0.02477754  0.313937  ]
aprob: nan
action 0
[-0.02018684 -0.19612731  0.03105628  0.32085134]
aprob: nan
action 0
[-0.02410939 -0.19625569  0.03747331  0.32910533]
aprob: nan
action 0
[-0.0280345  -0.19627589  0.04405542  0.33844017]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 10.623176
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.00087723 -0.19574318 -0.00059941  0.30666742]
aprob: nan
action 0
[-0.00303764 -0.19573209  0.00553394  0.30648269]
aprob: nan
action 0
[-0.00695228 -0.19579019  0.0116636   0.30811754]
aprob: nan
action 0
[-0.01086808 -0.19590628  0.01782595  0.31152256]
aprob: nan
action 0
[-0.01478621 -0.19605858  0.0240564   0.31660648]
aprob: nan
action 0
[-0.01870738 -0.19621351  0.03038853  0.323231  ]
aprob: nan
action 0
[-0.02263165 -0.19632393  0.03685315  0.33120288]
aprob: nan
action 0
resetting env. episode reward total was 8.000000. running mean: 10.596944
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  4.2

[-0.01949845 -0.19601258  0.03114628  0.31739457]
aprob: nan
action 0
[-0.0234187  -0.19616842  0.03749417  0.32583729]
aprob: nan
action 0
[-0.02734207 -0.19622247  0.04401092  0.33538046]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 10.486339
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  3.85081980e-04  -1.94931945e-01  -1.76535161e-05   2.88527478e-01]
aprob: nan
action 0
[-0.00351356 -0.19493274  0.0057529   0.28852342]
aprob: nan
action 0
[-0.00741221 -0.19501823  0.01152336  0.2903601 ]
aprob: nan
action 0
[-0.01131258 -0.19517857  0.01733057  0.29399378]
aprob: nan
action 0
[-0.01521615 -0.19539545  0.02321044  0.29934858]
aprob: nan
action 0
[-0.01912406 -0.19564088  0.02919741  0.3063111 ]
aprob: nan
action 0
[-0.02303687 -0.19587526  0.03532364  0.3147224 ]
aprob: nan
action 0
[-0.02695438 -0.19604511  0.04161808  0.32436768]
aprob: nan
action 0
[-0.03087528 -0.1960805   0.04810544  0.33496466]
aprob: nan
action 0
resetting env. episode rewar

[-0.01130295 -0.1951669   0.01825309  0.29372899]
aprob: nan
action 0
[-0.01520628 -0.19539414  0.02412767  0.2993691 ]
aprob: nan
action 0
[-0.01911417 -0.19564568  0.03011505  0.30659744]
aprob: nan
action 0
[-0.02302708 -0.19588028  0.036247    0.31524873]
aprob: nan
action 0
[-0.02694469 -0.19604251  0.04255197  0.32510051]
aprob: nan
action 0
[-0.03086554 -0.19606018  0.04905398  0.33586139]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 10.402182
[ 0.  0.  0.  0.]
aprob: nan
action 0
[-0.00086575 -0.19505111 -0.00063995  0.29112779]
aprob: nan
action 0
[-0.00476677 -0.19504219  0.00518261  0.2909253 ]
aprob: nan
action 0
[-0.00866761 -0.1951167   0.01100111  0.29256523]
aprob: nan
action 0
[-0.01256995 -0.19526538  0.01685242  0.296006  ]
aprob: nan
action 0
[-0.01647525 -0.19547031  0.02277254  0.30117288]
aprob: nan
action 0
[-0.02038466 -0.19570371  0.028796    0.30795303]
aprob: nan
action 0
[-0.02429873 -0.19592618  0.03495506  0.3161876

[-0.01559658 -0.19517028  0.02365792  0.29385281]
aprob: nan
action 0
[-0.01949999 -0.19544566  0.02953498  0.30111233]
aprob: nan
action 0
[-0.0234089  -0.19571485  0.03555723  0.30982573]
aprob: nan
action 0
[-0.0273232  -0.19592583  0.04175374  0.31978586]
aprob: nan
action 0
[-0.03124171 -0.19601043  0.04814946  0.33071948]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 10.290556
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  2.95193672e-04  -1.95238123e-01   6.83993713e-04   2.95248750e-01]
aprob: nan
action 0
[-0.00360957 -0.19524684  0.00658897  0.2954612 ]
aprob: nan
action 0
[-0.00751451 -0.19533407  0.01249819  0.29750561]
aprob: nan
action 0
[-0.01142119 -0.19548808  0.0184483   0.30133069]
aprob: nan
action 0
[-0.01533095 -0.19568772  0.02447492  0.30684882]
aprob: nan
action 0
[-0.0192447  -0.19590114  0.0306119   0.31393031]
aprob: nan
action 0
[-0.02316273 -0.19608379  0.0368905   0.32239492]
aprob: nan
action 0
[-0.0270844  -0.19617618  0

aprob: nan
action 0
[-0.02727687 -0.19577952  0.04049654  0.31373889]
aprob: nan
action 0
[-0.03119246 -0.19593566  0.04677132  0.32474592]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 10.182451
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -3.78586258e-05  -1.94837554e-01  -8.52095207e-04   2.86479729e-01]
aprob: nan
action 0
[-0.00393461 -0.194826    0.0048775   0.28620699]
aprob: nan
action 0
[-0.00783113 -0.19490145  0.01060164  0.28777937]
aprob: nan
action 0
[-0.01172916 -0.19505544  0.01635723  0.29115848]
aprob: nan
action 0
[-0.01563027 -0.19527147  0.0221804   0.29627544]
aprob: nan
action 0
[-0.0195357  -0.19552377  0.0281059   0.30302602]
aprob: nan
action 0
[-0.02344617 -0.1957756   0.03416643  0.31126299]
aprob: nan
action 0
[-0.02736168 -0.19597694  0.04039169  0.3207861 ]
aprob: nan
action 0
[-0.03128122 -0.19606192  0.04680741  0.3313305 ]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 10.180627
[ 

[  2.09695097e-04  -1.95631917e-01  -7.57303642e-06   3.04109810e-01]
aprob: nan
action 0
[-0.00370294 -0.19562849  0.00607462  0.30410253]
aprob: nan
action 0
[-0.00761551 -0.1956962   0.01215667  0.30591725]
aprob: nan
action 0
[-0.01152944 -0.19582318  0.01827502  0.30950203]
aprob: nan
action 0
[-0.0154459  -0.19598723  0.02446506  0.3147641 ]
aprob: nan
action 0
[-0.01936565 -0.1961545   0.03076034  0.32156445]
aprob: nan
action 0
[-0.02328874 -0.19627767  0.03719163  0.32970968]
aprob: nan
action 0
resetting env. episode reward total was 8.000000. running mean: 10.046478
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -2.35336456e-04  -1.94488351e-01  -6.38735950e-04   2.78994147e-01]
aprob: nan
action 0
[-0.0041251  -0.19448097  0.00494115  0.2787875 ]
aprob: nan
action 0
[-0.00801472 -0.19456502  0.0105169   0.28042351]
aprob: nan
action 0
[-0.01190602 -0.19473258  0.01612537  0.28386596]
aprob: nan
action 0
[-0.01580067 -0.19496846  0.02180269  0.2890518 ]
aprob: nan
action 0
[-0.0197

aprob: nan
action 0
[-0.0224747  -0.19611244  0.03563184  0.3226369 ]
aprob: nan
action 0
[-0.02639695 -0.19620936  0.04208457  0.33192548]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.945860
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -3.61553433e-04  -1.94864110e-01   1.79327952e-04   2.87055225e-01]
aprob: nan
action 0
[-0.00425884 -0.19486826  0.00592043  0.28711509]
aprob: nan
action 0
[-0.0081562  -0.19495793  0.01166273  0.28901515]
aprob: nan
action 0
[-0.01205536 -0.19512318  0.01744304  0.29271137]
aprob: nan
action 0
[-0.01595782 -0.19534575  0.02329726  0.29812818]
aprob: nan
action 0
[-0.01986474 -0.19559779  0.02925983  0.30515307]
aprob: nan
action 0
[-0.02377669 -0.19583997  0.03536289  0.31362845]
aprob: nan
action 0
[-0.02769349 -0.19601917  0.04163546  0.32334131]
aprob: nan
action 0
[-0.03161388 -0.19606587  0.04810229  0.33401149]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 9.946401
[ 0. 

aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 9.867132
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  9.54163597e-04  -1.94648057e-01  -6.44394085e-05   2.82402246e-01]
aprob: nan
action 0
[-0.0029388  -0.19464952  0.00558361  0.2823849 ]
aprob: nan
action 0
[-0.00683179 -0.19473983  0.0112313   0.28420898]
aprob: nan
action 0
[-0.01072658 -0.19490996  0.01691548  0.28783401]
aprob: nan
action 0
[-0.01462478 -0.19514311  0.02267216  0.29319053]
aprob: nan
action 0
[-0.01852765 -0.19541352  0.02853597  0.30017496]
aprob: nan
action 0
[-0.02243592 -0.19568461  0.03453947  0.30864164]
aprob: nan
action 0
[-0.02634961 -0.19590668  0.04071231  0.31839258]
aprob: nan
action 0
[-0.03026774 -0.19601425  0.04708016  0.32916564]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 9.868461
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  1.89877073e-06  -1.95490925e-01   7.61234436e-04   3.00906296e-01]
aprob: nan
action 0
[-0.00390792 -0.

[  2.62171493e-05  -1.94587348e-01  -2.87734092e-04   2.81103844e-01]
aprob: nan
action 0
[-0.00386553 -0.19458545  0.00533434  0.28101342]
aprob: nan
action 0
[-0.00775724 -0.19467342  0.01095461  0.2827651 ]
aprob: nan
action 0
[-0.01165071 -0.19484267  0.01660991  0.28632007]
aprob: nan
action 0
[-0.01554756 -0.19507703  0.02233631  0.2916114 ]
aprob: nan
action 0
[-0.0194491  -0.19535158  0.02816854  0.29853897]
aprob: nan
action 0
[-0.02335613 -0.19563081  0.03413932  0.30696171]
aprob: nan
action 0
[-0.02726875 -0.1958664   0.04027856  0.3166874 ]
aprob: nan
action 0
[-0.03118608 -0.19599446  0.0466123   0.32746088]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 9.818892
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  2.51940960e-04  -1.95500078e-01  -7.46263049e-04   3.01113149e-01]
aprob: nan
action 0
[-0.00365806 -0.19548821  0.005276    0.30088188]
aprob: nan
action 0
[-0.00756782 -0.19555131  0.01129364  0.30248211]
aprob: nan
action 0
[-0.0114

aprob: nan
action 0
[-0.02297821 -0.19605717  0.03674756  0.32138563]
aprob: nan
action 0
[-0.02689936 -0.19616015  0.04317527  0.33101656]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.726699
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.00030172 -0.19437585 -0.00046464  0.27660919]
aprob: nan
action 0
[-0.0035858  -0.19437167  0.00506755  0.27645975]
aprob: nan
action 0
[-0.00747323 -0.19446006  0.01059674  0.27815065]
aprob: nan
action 0
[-0.01136244 -0.19463312  0.01615976  0.28164592]
aprob: nan
action 0
[-0.0152551  -0.19487593  0.02179267  0.2868838 ]
aprob: nan
action 0
[-0.01915262 -0.19516536  0.02753035  0.29377188]
aprob: nan
action 0
[-0.02305592 -0.19546835  0.03340579  0.30217956]
aprob: nan
action 0
[-0.02696529 -0.19573961  0.03944938  0.31192805]
aprob: nan
action 0
[-0.03088008 -0.19591895  0.04568794  0.32277848]
aprob: nan
action 0
[-0.03479846 -0.19592861  0.05214351  0.33441949]
aprob: nan
action 0
resetting env. episode reward

aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.680776
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  3.37684177e-04  -1.95224447e-01  -2.41707258e-04   2.94945502e-01]
aprob: nan
action 0
[-0.0035668  -0.19522049  0.0056572   0.294869  ]
aprob: nan
action 0
[-0.00747121 -0.19529646  0.01155458  0.29662988]
aprob: nan
action 0
[-0.01137714 -0.19544206  0.01748718  0.30018241]
aprob: nan
action 0
[-0.01528598 -0.19563779  0.02349083  0.30544545]
aprob: nan
action 0
[-0.01919874 -0.19585376  0.02959974  0.31229707]
aprob: nan
action 0
[-0.02311582 -0.19604776  0.03584568  0.32056655]
aprob: nan
action 0
[-0.02703677 -0.19616307  0.04225701  0.33002418]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.673968
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.00050917 -0.19474469 -0.00051286  0.28447642]
aprob: nan
action 0
[-0.00338573 -0.19473866  0.00517667  0.28431277]
aprob: nan
action 0
[-0.0072805  -0.19482062  0.01086292  

aprob: nan
action 0
[-0.02763378 -0.19614695  0.04239684  0.32946276]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.660605
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.000686   -0.19531344 -0.00062185  0.2969231 ]
aprob: nan
action 0
[-0.00322027 -0.19530388  0.00531661  0.29672836]
aprob: nan
action 0
[-0.00712635 -0.19537298  0.01125118  0.29837064]
aprob: nan
action 0
[-0.01103381 -0.19551079  0.01721859  0.30180549]
aprob: nan
action 0
[-0.01494402 -0.19569796  0.0232547   0.30695217]
aprob: nan
action 0
[-0.01885798 -0.19590462  0.02939374  0.31368856]
aprob: nan
action 0
[-0.02277607 -0.1960885   0.03566752  0.32184323]
aprob: nan
action 0
[-0.02669784 -0.19619272  0.04210438  0.33118542]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.653999
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ 0.00045236 -0.19506732  0.00064758  0.29148314]
aprob: nan
action 0
[-0.00344899 -0.19507699  0.00647724  0.29168869]
aprob: n

[-0.00731575 -0.19470471  0.01074073  0.28345514]
aprob: nan
action 0
[-0.01120984 -0.19486953  0.01640983  0.28693286]
aprob: nan
action 0
[-0.01510723 -0.19509959  0.02214849  0.29214965]
aprob: nan
action 0
[-0.01900922 -0.1953701   0.02799148  0.2990058 ]
aprob: nan
action 0
[-0.02291662 -0.19564572  0.0339716   0.3073606 ]
aprob: nan
action 0
[-0.02682954 -0.19587825  0.04011881  0.31702221]
aprob: nan
action 0
[-0.0307471  -0.19600399  0.04645925  0.32773597]
aprob: nan
action 0
resetting env. episode reward total was 10.000000. running mean: 9.590540
[ 0.  0.  0.  0.]
aprob: nan
action 0
[  2.41661695e-04  -1.94711204e-01  -6.09530079e-04   2.83756271e-01]
aprob: nan
action 0
[-0.00365256 -0.19470374  0.0050656   0.28356094]
aprob: nan
action 0
[-0.00754664 -0.19478484  0.01073681  0.28520973]
aprob: nan
action 0
[-0.01144233 -0.19494602  0.01644101  0.28866424]
aprob: nan
action 0
[-0.01534125 -0.19517101  0.02221429  0.29385676]
aprob: nan
action 0
[-0.01924467 -0.19543454  0.

aprob: nan
action 0
[-0.01576059 -0.19575245  0.0234328   0.30837651]
aprob: nan
action 0
[-0.01967564 -0.19595234  0.02960033  0.31511609]
aprob: nan
action 0
[-0.02359468 -0.19612601  0.03590265  0.3232626 ]
aprob: nan
action 0
[-0.0275172  -0.19621556  0.04236791  0.33258082]
aprob: nan
action 0
resetting env. episode reward total was 9.000000. running mean: 9.551430
[ 0.  0.  0.  0.]
aprob: nan
action 0
[ -7.92754282e-04  -1.94981021e-01  -2.03561023e-05   2.89596168e-01]
aprob: nan
action 0
[-0.00469237 -0.19498152  0.00577157  0.28959087]
aprob: nan
action 0
[-0.00859201 -0.19506592  0.01156338  0.29142586]
aprob: nan
action 0
[-0.01249332 -0.19522426  0.0173919   0.29505689]
aprob: nan
action 0
[-0.01639781 -0.19543799  0.02329304  0.30040697]
aprob: nan
action 0
[-0.02030657 -0.19567872  0.02930118  0.30736106]
aprob: nan
action 0
[-0.02422014 -0.19590636  0.0354484   0.31575794]
aprob: nan
action 0
[-0.02813827 -0.19606678  0.04176356  0.32537995]
aprob: nan
action 0
[-0.03205

KeyboardInterrupt: 

## Exploring CartPole & Pong Environments

** CartPole Environment **
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.

Good general-purpose agents don't need to know the semantics of the observations: they can learn how to map observations to actions to maximize reward without any prior knowledge.

Cartpole-v0 observation's 4 parameters:
[position of cart, velocity of cart, angle of pole, rotation rate of pole]

In [2]:
import gym
env = gym.make('CartPole-v0')
print(env.action_space)
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

env = gym.make("Pong-v0")
print(env.action_space)
print(env.unwrapped.get_action_meanings())
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)


[2017-09-01 11:27:20,265] Making new env: CartPole-v0
[2017-09-01 11:27:20,269] Making new env: Pong-v0


Discrete(2)
Discrete(2)
Box(4,)
[  4.80000000e+00   3.40282347e+38   4.18879020e-01   3.40282347e+38]
[ -4.80000000e+00  -3.40282347e+38  -4.18879020e-01  -3.40282347e+38]
Discrete(6)
['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
Box(210, 160, 3)
[[[ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  ..., 
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]]

 [[ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  ..., 
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]]

 [[ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  ..., 
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]]

 ..., 
 [[ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  ..., 
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]]

 [[ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  ..., 
  [ 255.  255.  255.]
  [ 255.  255.  255.]
  [ 255.  255.  255.]]

 [[ 255.  255.  255.]
 

In [16]:
import gym
import time

env = gym.make('CartPole-v0')

done = False
ob = env.reset()
reward_sum = 0  
while True:
    # action = 1   # 1 - RIGHT
    action = 0   # 0 - LEFT
    env.render()
    time.sleep(1)
    ob, reward, done, _ = env.step(action)
    reward_sum += reward
    if done:
        print("Num steps before the pole tips:",reward_sum)
        break
        # Note there's no env.render() here. But the environment still can open window and
        # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
        # Video is not recorded every episode, see capped_cubic_video_schedule for details.


[2017-09-01 15:14:43,581] Making new env: CartPole-v0


Num steps before the pole tips: 10.0


In [14]:
import gym
import time

env = gym.make("Pong-v0")

done = False
ob = env.reset()
reward_sum = 0  
while True:
#    action = 2   # 2 - RIGHT (UP)
    action = 3   # 3 - LEFT (DOWN)

    env.render()
    time.sleep(0.01)
    ob, reward, done, _ = env.step(action)
    reward_sum += reward
    if done:
        print("Total reward:",reward_sum)
        break
        # Note there's no env.render() here. But the environment still can open window and
        # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
        # Video is not recorded every episode, see capped_cubic_video_schedule for details.
        


[2017-09-01 12:35:00,547] Making new env: Pong-v0


Total reward: -21.0


In [22]:
env.close()