In [1]:
from __future__ import division
import argparse

from PIL import Image
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

Using TensorFlow backend.


In [2]:
# Size every observation is resized to, in numpy order: (height, width).
INPUT_SHAPE = (84, 84)
# Not used in this cell; presumably the number of consecutive frames that
# make up one state -- confirm against the model/memory set-up.
WINDOW_LENGTH = 4


class AtariProcessor(Processor):
    """Pre-processing hooks for observations, state batches and rewards.

    Observations are shrunk to grayscale ``INPUT_SHAPE`` images stored as
    ``uint8``; the conversion to scaled floats is deferred to
    ``process_state_batch``; rewards are clipped to ``[-1, 1]``.
    """

    def process_observation(self, observation):
        """Resize one frame to ``INPUT_SHAPE`` and convert it to grayscale.

        Args:
            observation: 3-D array laid out as (height, width, channel).

        Returns:
            2-D ``uint8`` array whose shape equals ``INPUT_SHAPE``.

        Raises:
            AssertionError: if the input is not 3-D or the resized frame
                does not end up with shape ``INPUT_SHAPE``.
        """
        assert observation.ndim == 3  # (height, width, channel)
        target_height, target_width = INPUT_SHAPE
        img = Image.fromarray(observation)
        # PIL's resize() expects (width, height) -- the reverse of the numpy
        # (rows, cols) order used by INPUT_SHAPE. Passing the tuple unchanged
        # only works while the target shape is square.
        img = img.resize((target_width, target_height)).convert('L')
        processed_observation = np.array(img)
        assert processed_observation.shape == INPUT_SHAPE
        return processed_observation.astype('uint8')  # saves storage in experience memory

    def process_state_batch(self, batch):
        """Convert a batch to ``float32`` and divide it by 255.

        This could be done in ``process_observation`` instead, but then every
        stored observation would be a ``float32`` array, which is 4x more
        memory intensive than ``uint8``. This matters if we store 1M
        observations.

        Args:
            batch: array of stored observations.

        Returns:
            ``float32`` array of the same shape, scaled by 1/255.
        """
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """Clip ``reward`` to the closed interval [-1, 1]."""
        return np.clip(reward, -1., 1.)

In [4]:


# Single seed shared by numpy's global RNG and the environment, so both are
# seeded with the same value from one place.
SEED = 123

# Get the environment and extract the number of actions.
env = gym.make("BreakoutDeterministic-v4")
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

# One pre-formatted argument keeps the output identical on Python 2 (where
# `print(a, b)` would print a tuple) while also being valid Python 3.
print("env %s" % (env,))

[2017-08-22 10:54:38,993] Making new env: BreakoutDeterministic-v4


env <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>


In [5]:
# Print a handful of attributes of the environment wrapper for inspection.
# The names are listed in the order they should appear in the output.
ENV_ATTRIBUTES_TO_SHOW = (
    'action_space',
    'class_name',
    'close',
    'configure',
    'env',
    'metadata',
    'observation_space',
    'render',
    'reset',
    'reward_range',
    'seed',
)

for attribute_name in ENV_ATTRIBUTES_TO_SHOW:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(getattr(env, attribute_name))

Discrete(4)
<bound method type.class_name of <class 'gym.wrappers.time_limit.TimeLimit'>>
<bound method TimeLimit.close of <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>>
<bound method TimeLimit.configure of <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>>
<AtariEnv<BreakoutDeterministic-v4>>
{'render.modes': ['human', 'rgb_array']}
Box(210, 160, 3)
<bound method TimeLimit.render of <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>>
<bound method TimeLimit.reset of <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>>
(-inf, inf)
<bound method TimeLimit.seed of <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>>
