# Introduction

Here I'm practicing with deep-Q networks on atari games. After I get this working, I'll try switch to the taxi scenario.

https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26


### Setup

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import keras as ks
import tensorflow as tf
import time
#from keras.layers import InputLayer, Dense, Input
#from keras.models import Model
#from keras import backend as K
import keras
#K.set_image_dim_ordering('th')
%matplotlib inline



class Agent:
    
    def __init__(self,state):
        self.state = state
        self.epsilon = 0.5
        self.gamma = 0.99
        self.epsilon_min = 0.01
        self.memory = []
        self.memory_size = 20
        self.batch_size = 5    #how many memories to learn from in experience replay
        self.num_states = 5

        #Neural net, for predicting Qs
        inputs = Input(shape=(5,))
        x = Dense(10,activation='relu')(inputs)
        predictions = Dense(2,activation='relu')(x)
        self.model = Model(inputs=inputs,outputs=predictions)
        self.model.compile(loss='mse',optimizer='adam',metrics=['mae'])
        
        
        
    def vectorize_state(self,state,num_states):
        """ Given a state = 0,1,2,3
            return a 1 hot vector
        """
        return np.identity(num_states)[state:state+1]
        
        
    def get_epsilon_iteration(self,episode_number):
        return max(self.epsilon_min, self.epsilon / (1.0 + episode_number))
        
        
    def act(self,state,episode_number = 0):
        # epsilon greedy
        epsilon_effective = self.get_epsilon_iteration(episode_number)
        
        if np.random.random() < epsilon_effective:
            action = env.action_space.sample()
        else:
            action = self.choose_best_action(self.model,state)
        return action
                   
                   
    def choose_best_action(self,model,state):
        state_vector = np.identity(self.num_states)[state:state+1]
        Qs = model.predict(state_vector)
        action = np.argmax(Qs)
        return action
                   
                   
    def remember(self,event):
        
        if len(self.memory) <= self.memory_size:
            self.memory.append(event)
        else:
            self.memory.pop(0)
            self.memory.append(event)
        
        
    def learn(self):
        num_samples = min(len(self.memory), self.batch_size)
        if num_samples == 1 or num_samples == 0:
            pass
        else:
            indices = range(len(agent.memory))
            index_set = np.random.choice(indices,num_samples,replace=False)
            samples = [agent.memory[i] for i in indices if i in index_set]

            states = []
            Q_targets = []
            for event in samples:
                [state, action, reward, next_state] = event

                #Find Q_target
                next_state_vector = agent.vectorize_state(next_state,num_states) 
                Q_target = reward +  agent.gamma*max(agent.model.predict(next_state_vector))
                Q_targets.append(Q_target)

                #Find states
                state_vector = [1 if i == state else 0 for i in range(num_states)]
                states.append(state_vector)

            states = np.array(states)
            Q_targets = np.array(Q_targets)
            agent.model.fit(states,Q_targets,verbose=False)

### Code from website

In [None]:
import gym

# Create a breakout environment
env = gym.make('BreakoutDeterministic-v4')
# Reset it, returns the starting frame
frame = env.reset()
# Render
env.render()


is_done = False
while not is_done:
    # Perform a random action, returns the new frame, reward and whether the game is over
    frame, reward, is_done, _ = env.step(env.action_space.sample())
    # Render
    env.render()
    time.sleep(0.2)
env.close()

In [34]:
def to_grayscale(img):
    return np.mean(img, axis=2).astype(np.uint8)

def downsample(img):
    return img[::2, ::2]

def preprocess(img):
    return to_grayscale(downsample(img))

def transform_reward(reward):
    return np.sign(reward)




def fit_batch(model, gamma, start_states, actions, rewards, next_states, is_terminal):
    """Do one deep Q learning iteration.
    
    Params:
    - model: The DQN
    - gamma: Discount factor (should be 0.99)
    - start_states: numpy array of starting states
    - actions: numpy array of one-hot encoded actions corresponding to the start states
    - rewards: numpy array of rewards corresponding to the start states and actions
    - next_states: numpy array of the resulting states corresponding to the start states and actions
    - is_terminal: numpy boolean array of whether the resulting state is terminal
    
    """
    # First, predict the Q values of the next states. Note how we are passing ones as the mask.
    next_Q_values = model.predict([next_states, np.ones(actions.shape)])
    # The Q values of the terminal states is 0 by definition, so override them
    next_Q_values[is_terminal] = 0
    # The Q values of each start state is the reward + gamma * the max next state Q value
    Q_values = rewards + gamma * np.max(next_Q_values, axis=1)
    # Fit the keras model. Note how we are passing the actions as the mask and multiplying
    # the targets by the actions.
    model.fit(
        [start_states, actions], actions * Q_values[:, None],
        nb_epoch=1, batch_size=len(start_states), verbose=0
    )
    
    
    
    
def atari_model(n_actions):
    # We assume a theano backend here, so the "channels" are first.
    ATARI_SHAPE = (4, 105, 80)

    # With the functional API we need to define the inputs.
    frames_input = keras.layers.Input(ATARI_SHAPE, name='frames')
    actions_input = keras.layers.Input((n_actions,), name='mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    normalized = keras.layers.Lambda(lambda x: x / 255.0)(frames_input)
    
    # "The first hidden layer convolves 16 8×8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
    conv_1 = keras.layers.convolutional.Convolution2D(
        16, 8, 8, subsample=(4, 4), activation='relu'
    )(normalized)
    # "The second hidden layer convolves 32 4×4 filters with stride 2, again followed by a rectifier nonlinearity."
    conv_2 = keras.layers.convolutional.Convolution2D(
        32, 4, 4, subsample=(2, 2), activation='relu'
    )(conv_1)
    # Flattening the second convolutional layer.
    conv_flattened = keras.layers.core.Flatten()(conv_2)
    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)
    # "The output layer is a fully-connected linear layer with a single output for each valid action."
    output = keras.layers.Dense(n_actions)(hidden)
    # Finally, we multiply the output by the mask!
    filtered_output = keras.layers.merge.multiply([output, actions_input])

    model = keras.models.Model(input=[frames_input, actions_input], output=filtered_output)
    optimizer = keras.optimizers.RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    model.compile(optimizer, loss='mse')
    return model
    
    
def get_epsilon_iteration(epsilon,episode_number,num_episodes):
        #return max(self.epsilon_min, self.epsilon / (1.0 + episode_number))
            epsilon_min = 0.01
            slope = epsilon_min - epsilon
            epsilon_effective = slope*(episode_number/(1.0*num_episodes)) + epsilon
            return epsilon_effective
    
    
def q_iteration(env, model, state, iteration, memory):
    # Choose epsilon based on the iteration
    epsilon = get_epsilon_for_iteration(iteration)

    # Choose the action 
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = choose_best_action(model, state)

    # Play one game iteration (note: according to the next paper, you should actually play 4 times here)
    new_frame, reward, is_done, _ = env.step(action)
    memory.add(state, action, new_frame, reward, is_done)

    # Sample and fit
    batch = memory.sample_batch(32)
    fit_batch(model, batch)
    
    
    
class Memory:
    
    def __init__(self):
        self.buffer = []
        
        
    def add(self,event):
        self.buffer.append(event)
        
        
    def sample_batch(self, size):
        try:
            return random.sample(self.buffer, size)
        except ValueError:
            return self.buffer     

In [5]:
import gym

env = gym.make('BreakoutDeterministic-v4')
frame = env.reset()
state = preprocess(frame)
env.render()
num_episodes = 10**2
epsilon = 0.5
memory = Memory()
model = atari_model(env.action_space.n)

is_done = False
for episode in range(num_episodes): 

    env.reset()
    is_done = False
    while not is_done:
        
        #Decide action
        epsilon = get_epsilon_iteration(epsilon, episode, num_episodes)
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            Qs = model.predict(state)
            action = max(Qs)

        #Take step
        new_frame, reward, is_done, _ = env.step(action)
        new_state = preprocess(frame)
        memory.add([state, action, reward, new_state, is_done])
        
        # Sample and fit
        batch = memory.sample_batch(32)
        fit_batch(model, batch)
        
        # Render
        env.render()
        #time.sleep(0.2)
env.close()



ValueError: Negative dimension size caused by subtracting 8 from 4 for 'conv2d_1/convolution' (op: 'Conv2D') with input shapes: [?,4,105,80], [8,8,80,16].

In [12]:
def atari_model(n_actions):
    # We assume a theano backend here, so the "channels" are first.
    #ATARI_SHAPE = (4, 105, 80)
    ATARI_SHAPE = (105, 80, 4)

    # With the functional API we need to define the inputs.
    frames_input = keras.layers.Input(ATARI_SHAPE, name='frames')
    actions_input = keras.layers.Input((n_actions,), name='mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    normalized = keras.layers.Lambda(lambda x: x / 255.0)(frames_input)
    
    # "The first hidden layer convolves 16 8×8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
    conv_1 = keras.layers.convolutional.Convolution2D(
        16, 8, 8, subsample=(4, 4), activation='relu'
    )(normalized)
    # "The second hidden layer convolves 32 4×4 filters with stride 2, again followed by a rectifier nonlinearity."
    conv_2 = keras.layers.convolutional.Convolution2D(
        32, 4, 4, subsample=(2, 2), activation='relu'
    )(conv_1)
    # Flattening the second convolutional layer.
    conv_flattened = keras.layers.core.Flatten()(conv_2)
    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)
    # "The output layer is a fully-connected linear layer with a single output for each valid action."
    output = keras.layers.Dense(n_actions)(hidden)
    # Finally, we multiply the output by the mask!
    filtered_output = keras.layers.merge.multiply([output, actions_input])

    model = keras.models.Model(input=[frames_input, actions_input], output=filtered_output)
    optimizer = keras.optimizers.RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    model.compile(optimizer, loss='mse')
    return model

In [13]:
model = atari_model(env.action_space.n)
model.predict(frame)

  from ipykernel import kernelapp as app


ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
...

In [36]:
next_Q_values = model.predict([next_history, actions_mask])

NameError: name 'next_history' is not defined

In [71]:
frame,_,_,_ = env.step(1)
state = preprocess(frame)
temp = np.stack([state,state,state,state],axis=2)
#history = np.reshape([temp], (1, 84, 84, 4))

In [72]:
state = preprocess(frame)
history = np.stack((state, state, state, state), axis=2)
history = np.reshape([history], (1, 84, 84, 4))

ValueError: cannot reshape array of size 33600 into shape (1,84,84,4)

In [66]:
q_value = model.predict([history, np.ones(ACTION_SIZE).reshape(1, ACTION_SIZE)])
q_value

ValueError: Error when checking input: expected frames to have 4 dimensions, but got array with shape (4, 105, 80)

In [53]:
ACTION_SIZE = 4
q_value = model.predict([history, np.ones(ACTION_SIZE).reshape(1, ACTION_SIZE)])

ValueError: Error when checking input: expected frames to have 4 dimensions, but got array with shape (4, 105, 80)