In [1]:
import gym
from baselines import deepq

def callback(lcl, _glb):
    """Decide whether training may stop early.

    Args:
        lcl: mapping of the training loop's local variables; the keys
            't' and 'episode_rewards' are read.
        _glb: unused.

    Returns:
        A falsy result while 't' is not above 100. Otherwise, whether the sum
        of ``episode_rewards[-101:-1]`` divided by 100 is at least 199.
    """
    # Bail out before touching the reward history, as the `and` chain did.
    if not (lcl['t'] > 100):
        return False
    # Window of up to 100 entries that excludes the most recent one.
    window = lcl['episode_rewards'][-101:-1]
    mean_reward = sum(window) / 100
    return mean_reward >= 199

Logging to /tmp/openai-2018-09-07-20-25-41-088327


In [2]:
def train_cartpole():
    """Train a DQN agent on CartPole-v0 and save the resulting policy.

    Creates the environment, runs ``deepq.learn`` with an 'mlp' network and
    the module-level ``callback`` as the training callback, then saves the
    returned act object to the relative path ``cartpole_model.pkl``.
    """
    cartpole_env = gym.make("CartPole-v0")
    # Hyperparameters gathered in one place so they are easy to scan and tune.
    learn_settings = dict(
        network='mlp',
        lr=1e-3,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    trained_act = deepq.learn(cartpole_env, **learn_settings)
    print("Saving model to cartpole_model.pkl")
    trained_act.save("cartpole_model.pkl")


In [3]:
def evaluate_cartpole(num_episodes=None):
    """Render CartPole-v0 episodes played by the saved policy.

    Loads the act object from ``cartpole_model.pkl`` (the path
    ``train_cartpole`` saves to), plays episodes with rendering enabled and
    prints each episode's total reward.

    Args:
        num_episodes: how many episodes to play. ``None`` (the default)
            keeps the original behaviour of looping until interrupted;
            pass a number to let the cell finish on its own.
    """
    env = gym.make("CartPole-v0")
    try:
        act = deepq.load("cartpole_model.pkl")

        episodes_played = 0
        while num_episodes is None or episodes_played < num_episodes:
            obs, done = env.reset(), False
            episode_rew = 0
            while not done:
                env.render()
                # obs[None] adds a leading axis; [0] takes the first entry
                # of whatever act returns for that single observation.
                obs, rew, done, _ = env.step(act(obs[None])[0])
                episode_rew += rew
            print("Episode reward", episode_rew)
            episodes_played += 1
    finally:
        # Release the environment (and its render window) even when the
        # loop is stopped by an interrupt or the model fails to load.
        env.close()


In [4]:
def train_mountaincar():
    """Train a DQN agent with parameter-space noise on MountainCar-v0.

    Uses the same ``deepq.learn`` calling convention as ``train_cartpole``
    (``network=`` / ``total_timesteps=``). The earlier ``q_func=`` /
    ``max_timesteps=`` keywords belong to a different signature of the same
    function, so the two training cells could not both run against one
    baselines installation. The returned act object is saved to the relative
    path ``mountaincar_model.pkl``.
    """
    env = gym.make("MountainCar-v0")
    # Enabling layer_norm here is important for parameter space noise!
    # NOTE(review): num_layers / num_hidden / layer_norm are presumably
    # forwarded by deepq.learn to the network builder (one hidden layer of
    # 64 units, as the former mlp([64], layer_norm=True) requested) --
    # confirm against the installed baselines version.
    act = deepq.learn(
        env,
        network='mlp',
        num_layers=1,
        num_hidden=64,
        layer_norm=True,
        lr=1e-3,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.1,
        print_freq=10,
        param_noise=True
    )
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
      

In [5]:
def evaluate_mountaincar(num_episodes=None):
    """Render MountainCar-v0 episodes played by the saved policy.

    Loads the act object from ``mountaincar_model.pkl`` (the path
    ``train_mountaincar`` saves to), plays episodes with rendering enabled
    and prints each episode's total reward.

    Args:
        num_episodes: how many episodes to play. ``None`` (the default)
            keeps the original behaviour of looping until interrupted;
            pass a number to let the cell finish on its own.
    """
    env = gym.make("MountainCar-v0")
    try:
        act = deepq.load("mountaincar_model.pkl")

        episodes_played = 0
        while num_episodes is None or episodes_played < num_episodes:
            obs, done = env.reset(), False
            episode_rew = 0
            while not done:
                env.render()
                # obs[None] adds a leading axis; [0] takes the first entry
                # of whatever act returns for that single observation.
                obs, rew, done, _ = env.step(act(obs[None])[0])
                episode_rew += rew
            # Report inside the episode loop: the previous placement after
            # the endless `while True` could never be reached.
            print("Episode reward", episode_rew)
            episodes_played += 1
    finally:
        # Release the environment (and its render window) even when the
        # loop is stopped by an interrupt or the model fails to load.
        env.close()
    

In [6]:
# Train the CartPole agent; a progress table is printed every 10 episodes
# (print_freq=10) and the model is saved to cartpole_model.pkl on completion.
train_cartpole()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
--------------------------------------
| % time spent exploring  | 97       |
| episodes                | 10       |
| mean 100 episode reward | 24.6     |
| steps                   | 220      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 95       |
| episodes                | 20       |
| mean 100 episode reward | 23       |
| steps                   | 436      |
--------------------------------------


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


--------------------------------------
| % time spent exploring  | 93       |
| episodes                | 30       |
| mean 100 episode reward | 23.3     |
| steps                   | 674      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 91       |
| episodes                | 40       |
| mean 100 episode reward | 22.5     |
| steps                   | 877      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 89       |
| episodes                | 50       |
| mean 100 episode reward | 22.7     |
| steps                   | 1109     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 86       |
| episodes                | 60       |
| mean 100 episode reward | 23.4     |
| steps                   | 1379     |
--------------------------------------
--------------------------------------
| % time spent exploring 

KeyboardInterrupt: 

In [None]:
# Replay the policy saved in cartpole_model.pkl with rendering; called
# without arguments this keeps playing episodes until the kernel is interrupted.
evaluate_cartpole()

In [None]:
# Train the MountainCar agent and save it to mountaincar_model.pkl.
train_mountaincar()

In [None]:
# Replay the policy saved in mountaincar_model.pkl with rendering; called
# without arguments this keeps playing episodes until the kernel is interrupted.
evaluate_mountaincar()