In [1]:
import gym
import gym.spaces
import tensorflow as tf
import keras
from keras import backend as K #Idk why backend is required.
import numpy as np
import random
import queue
%matplotlib inline
import matplotlib.pyplot as plt
from skimage.transform import resize
from skimage.color import rgb2gray
from collections import deque

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#Class to store memory
class RingBuf:
    def __init__(self, size):
        # Pro-tip: when implementing a ring buffer, always allocate one extra element,
        # this way, self.start == self.end always means the buffer is EMPTY, whereas
        # if you allocate exactly the right number of elements, it could also mean
        # the buffer is full. This greatly simplifies the rest of the code.
        self.data = [None] * (size + 1)
        self.start = 0
        self.end = 0
        
    def append(self, element):
        self.data[self.end] = element
        self.end = (self.end + 1) % len(self.data)
        # end == start and yet we just added one element. This means the buffer has one
        # too many element. Remove the first element by incrementing start.
        if self.end == self.start:
            self.start = (self.start + 1) % len(self.data)
        
    def __getitem__(self, idx):
        return self.data[(self.start + idx) % len(self.data)]
    
    def __len__(self):
        if self.end < self.start:
            return self.end + len(self.data) - self.start
        else:
            return self.end - self.start
        
    def __iter__(self):
        for i in range(len(self.data)):
            yield self.data[i]

In [3]:
#function that chooses the best action:
def choose_best_action(model,state):
    x = np.expand_dims(state, axis=0)
    mask=np.ones((1,6))
    #mask = [1 , 1, 1 , 1 , 1, 1]
    print(mask.shape)
    best_action = np.argmax(model.predict([x, mask], verbose=1))
    print(best_action)
    return best_action

In [4]:
#modified 
def one_hot_encoding(value):
    output = np.zeros((1,6))
    output[0][value] = 1
    return output

In [5]:
#The keras model:
#modified
def Q_model(n_actions,ATARI_SHAPE):
    # I think n_actions is the one hot coded action array. Wrong. It is the number of actions.
    # Input takes the one hot coded action array as a mask.
    # We assume a theano backend here, so the "channels" are first.
    #ATARI_SHAPE = (1, 84, 84, 4) #not sure about this when tensorflow is used

    # With the functional API we need to define the inputs.
    frames_input = keras.layers.Input((ATARI_SHAPE), name='frames')
    actions_input = keras.layers.Input((n_actions,), name='mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    # normalized = keras.layers.Lambda(lambda x: x / 255.0)(frames_input)
    
    # "The first hidden layer convolves 16 8×8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
    conv_1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu',input_shape=(ATARI_SHAPE,))(frames_input)
    # "The second hidden layer convolves 32 4×4 filters with stride 2, again followed by a rectifier nonlinearity."
    conv_2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv_1)
    # Flattening the second convolutional layer.
    conv_flattened = keras.layers.core.Flatten()(conv_2)
    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)
    # "The output layer is a fully-connected linear layer with a single output for each valid action."
    output = keras.layers.Dense(n_actions)(hidden)
    # Finally, we multiply the output by the mask!
    #filtered_output = keras.layers.merge([output, actions_input], mode='mul')
    filtered_output = keras.layers.multiply(([output, actions_input]))

    model = keras.models.Model(input=[frames_input, actions_input], output=filtered_output)
    optimizer = keras.optimizers.RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    model.compile(optimizer, loss='mse')
    return model  #Idk if this return statement is needed.

In [6]:
#The function to get epsilon
def get_epsilon_for_iteration(iteration):
    if(iteration>1000000):
        return 0.1
    else:
        return(1 - 9*iteration/10000000)

In [7]:
def experience_replay():
    batch = 32 #number of elements in a batch.
    batch_state_mem = RingBuf(batch)
    batch_action_mem = RingBuf(batch)
    batch_q_mem = RingBuf(batch)
    rand_int_list = []
    for i in range(batch):
        index = gen_rand()
        while index in rand_int_list:
            index = gen_rand()
        rand_int_list.append(index)
        batch_state_mem.append(state_memory._getitem_(index))
        batch_action_mem.append(action_memory._getitem_(index))
        Qcalc = reward_memory._getitem_(index) + gamma*(np.max(model.predict([next_state_memory._getitem_(index), np.ones(n_actions)], verbose=1)))
        Q_calc = Qcalc * action_memory._getitem_(index)
        batch_q_mem.append(Q_calc)
    model.fit(x=[batch_state_mem, batch_action_mem], y = batch_q_mem, verbose = 1)

In [8]:
def preprocess(img):
    return resize(rgb2gray(img), (110, 84))[18:115 - 13, :] 

In [9]:
#Function to transform the reward.
def transform_reward(reward):
    #return np.sign(reward)
    return reward

In [10]:
#Function to fit a batch
#As for now, this function is a waste.
def fit_batch(model, gamma, start_states, actions, rewards, next_states, is_terminal):
    """Do one deep Q learning iteration.
    
    Params:
    - model: The DQN
    - gamma: Discount factor (should be 0.99)
    - start_states: numpy array of starting states
    - actions: numpy array of one-hot encoded actions corresponding to the start states
    - rewards: numpy array of rewards corresponding to the start states and actions
    - next_states: numpy array of the resulting states corresponding to the start states and actions
    - is_terminal: numpy boolean array of whether the resulting state is terminal
    
    """
    # First, predict the Q values of the next states. Note how we are passing ones as the mask.
    next_Q_values = model.predict([next_states, np.ones(actions.shape)])
    # The Q values of the terminal states is 0 by definition, so override them
    next_Q_values[is_terminal] = 0
    # The Q values of each start state is the reward + gamma * the max next state Q value
    Q_values = rewards + gamma * np.max(next_Q_values, axis=1)
    # Fit the keras model. Note how we are passing the actions as the mask and multiplying
    # the targets by the actions.
    model.fit([start_states, actions], actions * Q_values[:, None], nb_epoch=1, batch_size=len(start_states), verbose=0)

In [11]:
#Function to generate random numbers
def gen_rand():
    return random.randint(Q_calc_memory.start, Q_calc_memory.end)

In [17]:
#The initial function which does q-iteration
# Create the environment
env = gym.make('Pong-v0')
no_memory_elements=1000
n_actions = env.action_space.n
state_memory = RingBuf(no_memory_elements)
next_state_memory = RingBuf(no_memory_elements)
#Q_calc_memory = RingBuf(no_memory_elements)
action_memory = RingBuf(no_memory_elements)
reward_memory = RingBuf(no_memory_elements)
frame=env.reset()
state=preprocess(frame)
next_state=preprocess(frame)

for i in range (3):
    action=env.action_space.sample()
    new_frame, reward, is_done, _ = env.step(action)
    new_frame= preprocess(new_frame)
    state=np.dstack((state,new_frame))

ATARI_SHAPE=state.shape

no_episodes = 10
iteration=0
gamma=0.99 #discount factor
alpha = 0.01 #learning rate
model = Q_model(n_actions, ATARI_SHAPE)
for i in range(no_episodes):
    # Reset it, returns the starting frame
    frame=env.reset()
    state=preprocess(frame);
    next_state=preprocess(frame);
    # Render
    #env.render()
    for i in range (3):
        action=env.action_space.sample()
        new_frame, reward, is_done, _ = env.step(action)
        new_frame= preprocess(new_frame)
        state=np.dstack((state,new_frame))
        next_state=np.dstack((next_state,new_frame))
        action1=action
    is_done = False
    count=3
    #state_mem_episode = np.copy(state)
    #next_state_mem_episode = np.copy(next_state)
    while not is_done:
        if(count%4 ==0):
            count=count+1
            iteration=iteration+1
            epsilon = get_epsilon_for_iteration(iteration)
            # Choose the action 
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = choose_best_action(model, state)
                print("returned")
            new_frame, reward, is_done, _ = env.step(action)
            if(is_done):
                #continue #problem here : it goes out of for loop also?
                pass
            new_frame= preprocess(new_frame)
            reward=transform_reward(reward)
            next_state=np.delete(next_state,0,2)
            next_state=np.dstack((next_state,new_frame))
            ##Qcalc = reward + gamma* (np.max(model.predict([next_state, np.ones(n_actions)], verbose=0)))
            ##Q_calc = Qcalc * one_hot_encoding(action)
            ##Q_calc_memory.append(Q_calc)
            state_memory.append(state)
            next_state_memory.append(next_state)
            reward_memory.append(reward)
            action_memory.append(one_hot_encoding(action))
            state=np.delete(state,0,2)
            state=np.dstack((state,new_frame))
            action1=action
        else:
            new_frame, reward, is_done, _ = env.step(action1)
            new_frame= preprocess(new_frame)
            state=np.delete(state,0,2)
            state=np.dstack((state,new_frame))
            next_state=np.delete(next_state,0,2)
            next_state=np.dstack((next_state,new_frame))
            count=count+1
    print("Number of steps = {}".format(count))

Qcalc_memory = reward_memory.data + gamma*(np.max(model.predict([next_state_memory.data, np.ones((1,6))], verbose=0, steps = 1)))
Q_calc_memory = Qcalc_memory * one_hot_encoding(action)
model.fit(x=[state_memory.data, action_memory.data], y = Q_calc_memory, verbose = 1)
no_replays = 2
for j in range (no_replays):
    experience_replay()
final()
env.close()
#env.close()
#fit batches now.
#also fit in each inner loop

  warn("The default mode, 'constant', will be changed to 'reflect' in "


Number of steps = 1015
Number of steps = 1191
Number of steps = 1337
Number of steps = 1220
Number of steps = 1275
Number of steps = 1035
Number of steps = 1106
Number of steps = 1156
(1, 6)
1
returned
(1, 6)
1
returned
(1, 6)
1
returned
Number of steps = 1110
Number of steps = 1089


ValueError: operands could not be broadcast together with shapes (1001,) (1,6) 

In [None]:
#The final function which plays
# Reset the environment, returning the starting frame
def final()
    new_frame = env.reset()
    state.extend(new_frame)
    # Render
    env.render()
    for i in range (3):
            action=env.action_space.sample()
            new_frame, reward, is_done, _ = env.step(action)
            new_frame= preprocess(new_frame)
            state.extend(new_frame)
    count=0
    is_done = False
    while not is_done:
        action = choose_best_action(model, state)
        # Perform the best action, returns the new frame, reward and whether the game is over
        frame, reward, is_done, _ = env.step(action)
        new_frame= preprocess(new_frame)
        del state[0:shape_to_del]
        state.extend(new_frame)    
        # Render
        env.render()