In [54]:
import gym
import keras
import random
from keras import backend as K

[2018-03-15 19:17:06,737] Making new env: BreakoutDeterministic-v4


In [48]:
def to_grayscale(img):
    return np.mean(img, axis=2).astype(np.uint8)


def downsample(img):
    return img[::2, ::2]


def preprocess(img):
    return to_grayscale(downsample(img))


def transform_reward(reward):
    return np.sign(reward)



In [49]:
def fit_batch(model, target_model, gamma, start_states, actions, rewards, next_states, is_terminal):
    # Predict q values of next states, passing ones as mask
    next_q_values = target_model.predict([next_states, np.ones(actions.shape)])
    
    # Terminal state's q values are 0 by definition
    next_q_values[is_terminal] = 0
    
    q_values = rewards + gamma * np.max(next_q_values, axis=1)
    
    # Pass actions as the mask and multiply targets by the actions 
    model.fit([start_states, actions], actions * q_values[:, None], nb_epoch=1, batch_size=len(start_states), verbose=0)


In [59]:
def model(observation_space, n_actions):
    # Channels are last, change order if using different convention
    shape = observation_space.shape
    
    # Define inputs for the Functional API
    frames_input = keras.layers.Input(shape, name='frames')
    actions_input = keras.layers.Input((n_actions,), name='mask')
    
    norm = keras.layers.Lambda(lambda x: x/255.0)(frames_input)
    
    conv_1 = keras.layers.convolutional.Conv2D(16, 8, strides=4, activation='relu')(norm)
    conv_2 = keras.layers.convolutional.Conv2D(32, 4, strides=2, activation='relu')(conv_1)
    
    conv_flattened = keras.layers.core.Flatten()(conv_2)
       
    hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)
    
    output = keras.layers.Dense(n_actions)(hidden)
    
    # Multiply by the mask
    filtered_output = keras.layers.merge(inputs=[output, actions_input], mode='mul')
    
    self.model = keras.models.Model(inputs=[frames_input, actions_input], output=filtered_output)
    optimizer = keras.optimizers.RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    self.model.compile(optimizer, loss='huber_loss')


In [51]:
class RingBuf:
    def __init__(self, size):
        self.data = [None] * (size + 1)
        self.start = 0
        self.end = 0
        
    def append(self, element):
        self.data[self.end] = element
        self.end = (self.end + 1) % len(self.data)
        
        if self.end == self.start:
            self.start = (self.start + 1) % len(self.data)
            
    def __getitem__(self, index):
        return self.data[(self.start + index) % len(self.data)]
    
    def __len__(self):
        if self.end < self.start:
            return self.end + len(self.data) - self.start
        else:
            return self.end - self.start
        
    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

In [52]:
def q_iteration(env, model, state, iteration, memory):
    epsilon = get_epsilon(iteration)
    
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = chose_best_action(model, state)
        
    new_frame, reward, done, _ = env.step(action)
    memory.add(state, action, new_frame, reward, done)
    
    batch = memory.sample_batch(32)
    fit_batch(model, batch)

In [53]:
def copy_model(model):
    model.save('tmp_model')
    return keras.models.load_model('tmp_model')


In [55]:
def huber_loss(a, b, in_keras=True):
    error = a - b
    quadratic_term = error * error / 2
    linear_term = abs(error) - 1/2
    use_linear_term = (abs(error) > 1.0)
    if in_keras:
        use_linear_term = K.cast(use_linear_term, 'float32')
    return use_linear_term * linear_term +(1 - use_linear_term) * quadratic_term

In [61]:
EPISODES = 100
memory = RingBuf(1000)
env = gym.make('BreakoutDeterministic-v4')
frame = env.reset()
done = False
model = model(env.observation_space, env.action_space)

for iteration in range(EPISODES):
    state = env.reset()
    while not done:
        if iteration == (EPISODES-1):
            env.render()
        q_iteration(env, model, state, iteration, memory)
        
        if done:
            env.render(close=True)
            break
        

[2018-03-15 19:40:58,138] Making new env: BreakoutDeterministic-v4


TypeError: Error converting shape to a TensorShape: int() argument must be a string, a bytes-like object or a number, not 'Discrete'.