In [1]:
import gym
import keras
import random
import numpy as np
from collections import deque
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
class Agent:
    """Epsilon-greedy deep Q-learning agent built around a masked-output CNN.

    The network takes two inputs, a frame and a per-action mask, and outputs
    the predicted Q-values multiplied element-wise by that mask. Training
    passes the one-hot action taken as the mask so that only the Q-value of
    that action contributes to the loss.

    NOTE: ``build_model`` and ``copy_model`` look up a module-level
    ``huber_loss`` when they are called, so that function must be defined
    before an ``Agent`` is instantiated.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the online network.

        Args:
            state_size: shape tuple passed to the frame ``Input`` layer.
            action_size: number of discrete actions (width of the mask/output).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95            # discount applied to the bootstrapped next-state value
        self.epsilon = 1.0           # current probability of taking a random action
        self.epsilon_min = 0.001     # floor for epsilon
        self.epsilon_decay = 0.995   # multiplicative decay applied on every q_iteration call
        self.learning_rate = 0.001   # not read by any method in this class
        self.optimizer_lr = 0.00025
        self.optimizer_rho = 0.95
        self.optimizer_epsilon = 0.01
        self.batch_size = 32
        self.model = self.build_model(state_size, action_size)

    def to_grayscale(self, img):
        """Average over axis 2 of ``img`` and truncate the result to uint8."""
        return np.mean(img, axis=2).astype(np.uint8)

    def downsample(self, img):
        """Keep every second row and every second column of ``img``."""
        return img[::2, ::2]

    def preprocess(self, img):
        """Downsample ``img`` by two along both leading axes, then grayscale it."""
        return self.to_grayscale(self.downsample(img))

    def fit_batch(self, model, target_model, gamma, start_states, actions, rewards, next_states, is_terminal):
        """Run one fitting step of ``model`` on a batch of transitions.

        Args:
            model: network being trained; needs ``fit``.
            target_model: network used to evaluate the next states; needs ``predict``.
            gamma: discount factor.
            start_states: batch of states the actions were taken from.
            actions: one-hot encoded actions, one row per transition.
            rewards: reward per transition.
            next_states: batch of resulting states.
            is_terminal: per-transition flag, truthy where the episode ended.
        """
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        # Force a boolean mask: an integer 0/1 array would otherwise be treated
        # as row indices by the assignment below.
        is_terminal = np.asarray(is_terminal, dtype=bool)

        # Predict q values of next states, passing ones as mask.
        # Copied so the zeroing below never writes into an array owned by the model.
        next_q_values = np.array(target_model.predict([next_states, np.ones(actions.shape)]))

        # Terminal state's q values are 0 by definition
        next_q_values[is_terminal] = 0

        q_values = rewards + gamma * np.max(next_q_values, axis=1)

        # Pass actions as the mask and multiply targets by the actions
        model.fit([start_states, actions], actions * q_values[:, None], epochs=1,
                  batch_size=len(start_states), verbose=0)

    def build_model(self, state_size, action_size):
        """Build and compile the two-input (frames, mask) Q-network.

        Returns:
            A compiled Keras model whose output is the per-action Q-values
            multiplied by the mask input.
        """
        # Define inputs for the Functional API
        frames_input = keras.layers.Input(state_size, name='frames')
        actions_input = keras.layers.Input((action_size,), name='mask')

        # Scale pixel values by 1/255 inside the graph
        norm = keras.layers.Lambda(lambda x: x / 255.0)(frames_input)

        conv_1 = keras.layers.Conv2D(16, 8, strides=4, activation='relu')(norm)
        conv_2 = keras.layers.Conv2D(32, 4, strides=2, activation='relu')(conv_1)

        conv_flattened = keras.layers.Flatten()(conv_2)

        hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)

        output = keras.layers.Dense(action_size)(hidden)

        # Multiply by the mask
        filtered_output = keras.layers.multiply(inputs=[output, actions_input])

        model = keras.models.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
        optimizer = keras.optimizers.RMSprop(lr=self.optimizer_lr, rho=self.optimizer_rho,
                                             epsilon=self.optimizer_epsilon)
        model.compile(optimizer, loss=huber_loss)
        return model

    def q_iteration(self, env, model, state, iteration, memory, target_model=None):
        """Take one epsilon-greedy environment step and, once warm, train on a batch.

        Args:
            env: environment exposing ``action_space`` and ``step``.
            model: online network used for action selection and training.
            state: current observation (a single frame, no batch axis).
            iteration: caller-maintained counter; training starts only once it
                exceeds ``self.batch_size``.
            memory: replay container supporting ``append``, ``len`` and integer
                indexing. Transitions are stored as
                ``(state, action, new_frame, reward, done)``.
            target_model: optional network used to evaluate next states;
                defaults to ``model`` itself.

        Returns:
            ``(new_frame, reward, done)`` as returned by ``env.step``.
        """
        if self.epsilon > self.epsilon_min:
            # Clamp so the decay can never push epsilon below its floor.
            self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)

        n_actions = env.action_space.n

        if random.random() < self.epsilon:
            action = env.action_space.sample()
        else:
            # The network expects a leading batch axis on both inputs.
            frames = np.expand_dims(state, axis=0)
            mask = np.ones((1, n_actions))
            pred = model.predict([frames, mask])
            action = int(np.argmax(pred[0]))

        new_frame, reward, done, _ = env.step(action)
        memory.append((state, action, new_frame, reward, done))

        # Keep the caller's warm-up gate, and additionally require enough stored
        # transitions so that sampling cannot fail.
        if iteration > self.batch_size and len(memory) >= self.batch_size:
            # Sample positions rather than the container itself so any indexable
            # buffer works, not just built-in sequences.
            picks = random.sample(range(len(memory)), self.batch_size)
            states, actions, next_states, rewards, dones = zip(*(memory[i] for i in picks))

            # fit_batch multiplies targets by the actions, so they must be one-hot.
            one_hot_actions = np.eye(n_actions)[np.asarray(actions, dtype=int)]

            self.fit_batch(model,
                           model if target_model is None else target_model,
                           self.gamma,
                           np.stack(states),
                           one_hot_actions,
                           np.asarray(rewards, dtype=float),
                           np.stack(next_states),
                           np.asarray(dones, dtype=bool))

        return new_frame, reward, done

    def copy_model(self, model):
        """Return a copy of ``model`` by saving it and loading it back.

        The model is written to the path 'tmp_model' relative to the current
        working directory; that file is left in place afterwards.
        """
        model.save('tmp_model')
        return keras.models.load_model('tmp_model', custom_objects={'huber_loss': huber_loss})
    

In [4]:
EPISODES = 100
if __name__ == "__main__":
    env = gym.make('BreakoutDeterministic-v4')
    state_size = env.observation_space.shape
    action_size = env.action_space.n
    agent = Agent(state_size, action_size)
    memory = deque(maxlen=1000)
    total_steps = 0  # environment steps taken across all episodes

    try:
        for episode in range(EPISODES):
            state = env.reset()
            # Reset per episode; a single flag set before the loop would either
            # never end the first episode or skip every later one.
            done = False
            while not done:
                # Only the final episode is rendered.
                if episode == (EPISODES - 1):
                    env.render()
                agent.q_iteration(env, agent.model, state, total_steps, memory)

                # q_iteration appends the transition it just took as
                # (state, action, new_frame, reward, done); read it back so the
                # loop advances to the new frame and notices the episode end.
                _, _, state, _, done = memory[-1]
                total_steps += 1
    finally:
        # Release the environment (and any render window) even if training fails.
        env.close()
        

[2018-03-18 16:50:39,478] Making new env: BreakoutDeterministic-v4




NameError: name 'huber_loss' is not defined

In [24]:
class RingBuf:
    """Fixed-capacity FIFO buffer that overwrites its oldest element when full.

    One extra slot is allocated so that ``start == end`` always means "empty".
    """

    def __init__(self, size):
        """Create a buffer that retains at most ``size`` elements."""
        self.data = [None] * (size + 1)
        self.start = 0  # slot of the oldest element
        self.end = 0    # slot the next append writes to

    def append(self, element):
        """Add ``element`` as the newest item, dropping the oldest if full."""
        self.data[self.end] = element
        self.end = (self.end + 1) % len(self.data)

        # The write pointer caught up with the read pointer: the buffer was
        # full, so advance the start past the oldest element.
        if self.end == self.start:
            self.start = (self.start + 1) % len(self.data)

    def __getitem__(self, index):
        """Return the element at logical position ``index`` (0 is the oldest).

        Negative indices count from the newest element.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        length = len(self)
        if index < 0:
            index += length
        # Without this check an out-of-range index would wrap around and
        # silently return an unrelated or unused slot.
        if not 0 <= index < length:
            raise IndexError('RingBuf index out of range')
        return self.data[(self.start + index) % len(self.data)]

    def __len__(self):
        """Return the number of elements currently stored."""
        return (self.end - self.start) % len(self.data)

    def __iter__(self):
        """Yield the stored elements from oldest to newest."""
        for i in range(len(self)):
            yield self[i]
            
            
            

In [4]:
def huber_loss(a, b, in_keras=True):
    """Element-wise Huber loss of ``a - b`` with the switch point at 1.0.

    Where the absolute difference is at most 1 the loss is half the squared
    difference; above that it is the absolute difference minus one half.

    Args:
        a: first operand.
        b: second operand.
        in_keras: when true, the selection mask is cast to 'float32' through
            the Keras backend so it can be multiplied with the other terms.

    Returns:
        The blended loss, with the same structure as ``a - b``.
    """
    residual = a - b

    # Decide per element which branch applies.
    is_large = (abs(residual) > 1.0)
    if in_keras:
        is_large = K.cast(is_large, 'float32')

    small_branch = residual * residual / 2
    large_branch = abs(residual) - 1/2

    # The mask selects the linear branch; its complement selects the quadratic one.
    return is_large * large_branch + (1 - is_large) * small_branch


In [19]:

def to_grayscale(img):
    """Average ``img`` over axis 2 and return the result truncated to uint8."""
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)

def downsample(img):
    """Return ``img`` with only every second row and every second column kept."""
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Halve ``img`` along its first two axes, then convert it to grayscale."""
    reduced = downsample(img)
    return to_grayscale(reduced)

# Smoke-test the preprocessing pipeline on the first frame of a fresh episode.
test = gym.make('BreakoutDeterministic-v4')
state_test = test.reset()
# A bare `state_test.shape` in the middle of a cell is evaluated and discarded,
# so check the raw frame against the environment's declared shape instead.
assert state_test.shape == test.observation_space.shape
# Last expression of the cell: displayed as the cell's output.
preprocess(state_test).shape

[2018-03-18 16:56:57,129] Making new env: BreakoutDeterministic-v4


(105, 80)