In [1]:
# Import libraries

import tensorflow as tf
import random
import graphical, game
import numpy as np
import os

from collections import deque, namedtuple

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
N_Rows = 10
N_Cols = 8
N_Dir = 2

report_frequency = 100

All_Actions = np.arange(0,N_Rows*N_Cols*N_Dir)

N_All_Actions = len(All_Actions)
N_State = N_Rows * N_Cols + 1 # board game state + moves_left

def get_state(board, moves_left):
    state = np.zeros(len(board) - (N_Rows-1) + 1)
    
    c_state_index = 0
    for s in range(0,len(board)):
        if board[s] != '\n':
            #print(board[s])
            state[c_state_index] = ord(board[s]) - ord('a')
            state[c_state_index] = -1 if state[c_state_index] < 0 else state[c_state_index]
            c_state_index += 1
            
    state[-1] = moves_left
    return state

def get_action_from(move):
    action = np.array(move)
    
    if move[2]:
        action[2] = 1
    else:
        action[2] = 0
    
    out_action = (action[2]) * (N_Rows * N_Cols) + (action[0] * N_Rows + action[1])
    
    return out_action

def get_move_from(action):
    row_col = action % (N_Rows * N_Cols)
    
    dir = int(action / (N_Rows * N_Cols))
    
    return (int(row_col / N_Rows), row_col % N_Rows, dir >= 1)

# test action conversion
num_error_in_conversion = 0
for i in range(0,160):
    a = get_move_from(i)
    ii = get_action_from(a)
    if i != ii:
        num_error_in_conversion += 1
print("number of errors happens in action conversion: ", num_error_in_conversion)

number of errors happens in action conversion:  0


In [3]:
class QNetwork():
    """Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.
    """

    def __init__(self, scope="estimator", summaries_dir=None):
        self.scope = scope
        # Writes Tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        """
        Builds the Tensorflow graph.
        """

        # Placeholders for our input
        # Our inputs are board game state with shape of (None, N_State)
        self.X_pl = tf.placeholder(shape=[None, N_State], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        X = (tf.to_float(self.X_pl) + 1) / 5.0 # normalize input between (0,1)
        batch_size = tf.shape(self.X_pl)[0]

        # Three fully connected layers
        fully1 = tf.contrib.layers.fully_connected(X, 100, activation_fn=tf.nn.relu)      # 80 to 100
        fully2 = tf.contrib.layers.fully_connected(fully1, 120, activation_fn=tf.nn.relu) # 100 to 120
        fully3 = tf.contrib.layers.fully_connected(fully2, 140, activation_fn=tf.nn.relu) # 120 to 140

        # output layers
        self.predictions = tf.contrib.layers.fully_connected(fully3, N_All_Actions)  # 140 to 160

        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calculate the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])

    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, N_State]

        Returns:
          Tensor of shape [batch_size, N_All_Actions] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y, episode_num):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, N_State]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        #if self.summary_writer and episode_num % report_frequency == 0:
        #    self.summary_writer.add_summary(summaries, global_step)
        return loss

In [4]:
class QNetworkCopier():
    """
    Copy model parameters of one estimator to another.
    """
    
    def __init__(self, estimator1, estimator2):
        """
        Defines copy-work operation graph.  
        Args:
          estimator1: Estimator to copy the paramters from
          estimator2: Estimator to copy the parameters to
        """
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        self.update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            self.update_ops.append(op)
            
    def make(self, sess):
        """
        Makes copy.
        Args:
            sess: Tensorflow session instance
        """
        sess.run(self.update_ops)

In [5]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.

    Args:
        estimator: An estimator that returns q values for a given state
        nA: Number of actions in the environment.

    Returns:
        A function that takes the (sess, observation, epsilon) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.

    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [6]:
Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

class deep_q_learning():
    def __init__(self, sess,
                 q_estimator,
                 target_estimator,
                 num_episodes,
                 experiment_dir,
                 replay_memory_size=500000,
                 replay_memory_init_size=50000,
                 update_target_estimator_every=10000,
                 discount_factor=0.99,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_steps=500000,
                 batch_size=32):
        """
        Q-Learning algorithm for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            sess: Tensorflow Session object
            q_estimator: Estimator object used for the q values
            target_estimator: Estimator object used for the targets
            num_episodes: Number of episodes to run for
            experiment_dir: Directory to save Tensorflow summaries in
            replay_memory_size: Size of the replay memory
            replay_memory_init_size: Number of random experiences to sampel when initializing 
                                     the reply memory.
            update_target_estimator_every: Copy parameters from the Q estimator to the 
                                           target estimator every N steps
            discount_factor: Gamma discount factor
            epsilon_start: Chance to sample a random action when taking an action.
                           Epsilon is decayed over time and this is the start value
            epsilon_end: The final minimum value of epsilon after decaying is done
            epsilon_decay_steps: Number of steps to decay epsilon over
            batch_size: Size of batches to sample from the replay memory
        """
        
        self.sess = sess
        self.q_estimator = q_estimator
        self.target_estimator = target_estimator
        self.num_episodes = num_episodes
        self.experiment_dir = experiment_dir
        self.replay_memory_size = replay_memory_size
        self.replay_memory_init_size = replay_memory_init_size
        self.update_target_estimator_every = update_target_estimator_every
        self.discount_factor = discount_factor
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_steps = epsilon_decay_steps
        self.batch_size = batch_size
        
        self.cur_episode = 0
        self.cur_t = 0
        self.loss = None
        
        # The replay memory
        self.replay_memory = []
        
        # Make model copier object
        self.estimator_copy = QNetworkCopier(self.q_estimator, self.target_estimator)

        # Keeps track of useful statistics
        self.stats = {'q_net_loss':0, 'episode_rewards':0, 'epsilon':0, 'counter_observation':0, 'counter_episode':0}

        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.experiment_dir, "checkpoints")
        self.checkpoint_path = os.path.join(self.checkpoint_dir, "model")

        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.saver = tf.train.Saver()
        # Load a previous checkpoint if we find one
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

        # Get the current time step
        self.total_t = self.sess.run(tf.contrib.framework.get_global_step())

        # The epsilon decay schedule
        self.epsilons = np.linspace(self.epsilon_start, self.epsilon_end, self.epsilon_decay_steps)

        # The policy we're following
        self.policy = make_epsilon_greedy_policy(self.q_estimator, N_All_Actions)
    
    def collect_observation(self, board, move, score_delta, next_board, moves_left):
        state = get_state(board, moves_left + 1)
        reward = score_delta / 100.0
        action = get_action_from(move)
        n_state = get_state(next_board, moves_left)
        done = (moves_left == 0)
        
        # If our replay memory is full, pop the first element
        if len(self.replay_memory) == self.replay_memory_size:
            self.replay_memory.pop(0)
        
        self.replay_memory.append(Transition(state, action, reward, n_state, done))
        
        if len(self.replay_memory) < self.replay_memory_init_size:
            return
        
        # Update statistics
        if self.loss is not None:
            self.stats['counter_observation'] += 1
            self.stats['epsilon'] += self.epsilons[min(self.total_t, self.epsilon_decay_steps-1)]
            self.stats['episode_rewards'] += reward
            self.stats['q_net_loss'] += self.loss
        
        self.cur_t += 1
        return
    
    def predict_action(self, board, score, moves_left):
        state = get_state(board, moves_left)
        action_probs = self.policy(self.sess, state, self.epsilons[min(self.total_t, self.epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        
        return get_move_from(All_Actions[action])
    
    def update(self):
        if len(self.replay_memory) < self.replay_memory_init_size:
            return
        
        # Sample a minibatch from the replay memory
        samples = random.sample(self.replay_memory, self.batch_size)
        states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

        # Calculate q values and targets
        q_values_next = target_estimator.predict(self.sess, next_states_batch)
        targets_batch = reward_batch \
                      + np.invert(done_batch).astype(np.float32) \
                      * self.discount_factor \
                      * np.amax(q_values_next, axis=1)

        # Perform gradient descent update
        states_batch = np.array(states_batch)
        self.loss = q_estimator.update(sess, states_batch, action_batch, targets_batch, self.cur_episode)

        self.total_t += 1
        
        # Maybe update the target estimator
        if self.total_t % self.update_target_estimator_every == 0:
            self.estimator_copy.make(self.sess)
            print("\nCopied model parameters to target network.")

        # Print out which step we're on, useful for debugging.
        print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                self.cur_t, self.total_t, self.cur_episode + 1, self.num_episodes, self.loss), end="")
    
    def init_episode(self):
        self.cur_t = 0
        
        if len(self.replay_memory) < self.replay_memory_init_size:
            return
        # Save the current checkpoint
        self.saver.save(self.sess, self.checkpoint_path)
        
        # Add summaries to tensorboard
        if self.stats['counter_observation'] > 1000:
            episode_summary = tf.Summary()
            episode_summary.value.add(
                simple_value=self.stats['epsilon'] / self.stats['counter_observation'], tag="episode/epsilon")
            episode_summary.value.add(
                simple_value=self.stats['episode_rewards'] / self.stats['counter_episode'], tag="episode/reward")
            episode_summary.value.add(
                simple_value=self.stats['q_net_loss'] / self.stats['counter_observation'], tag="QNetLoss")
            q_estimator.summary_writer.add_summary(episode_summary, self.cur_episode)
            q_estimator.summary_writer.flush()
            
            self.stats['counter_observation'] = 0
            self.stats['counter_episode'] = 0
            self.stats['epsilon'] = 0
            self.stats['episode_rewards'] = 0
            self.stats['q_net_loss'] = 0
        
        # Reset
        self.cur_episode += 1
        self.stats['counter_episode'] += 1

In [7]:
global my_dqn

def ai_callback(board, score, moves_left):
    global my_dqn
    
    predicted_move = my_dqn.predict_action(board, score, moves_left)
    #print(predicted_move)
    
    #dir = random.randint(0, 1) == 0
    #return (random.randint(0, 7 if dir else 6), random.randint(0, 8 if dir else 9), dir)
    return predicted_move

def transition_callback(board, move, score_delta, next_board, moves_left):
    global my_dqn
    
    my_dqn.collect_observation(board, move, score_delta, next_board, moves_left)
    my_dqn.update()
    
    pass # This can be used to monitor outcomes of moves

def end_of_game_callback(boards, scores, moves, final_score):
    global my_dqn
    
    my_dqn.init_episode()
    
    return True # True = play another, False = Done


if __name__ == '__main__':
    global my_dqn
    
    tf.reset_default_graph()

    # Where we save our checkpoints and graphs
    experiment_dir = os.path.abspath("./experiments/{}".format("ubisoft-game"))

    # Create a glboal step variable
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Create estimators
    q_estimator = QNetwork(scope="q_estimator", summaries_dir=experiment_dir)
    target_estimator = QNetwork(scope="target_q")
    
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    
    my_dqn = deep_q_learning(sess,
                             q_estimator=q_estimator,
                             target_estimator=target_estimator,
                             experiment_dir=experiment_dir,
                             num_episodes=10000,
                             replay_memory_size=500000,
                             replay_memory_init_size=50000,
                             update_target_estimator_every=10000,
                             epsilon_start=1.0,
                             epsilon_end=0.1,
                             epsilon_decay_steps=500000,
                             discount_factor=0.99,
                             batch_size=32)

    speedup = 1000.0
    g = graphical.Game(ai_callback, transition_callback, end_of_game_callback, speedup)
    g.run()


Instructions for updating:
Please switch to tf.train.get_global_step


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Seed: 81170877550159050235589795067197456223
Step 23 (9999) @ Episode 401/10000, loss: 74.680908203125254
Copied model parameters to target network.
Step 23 (19999) @ Episode 801/10000, loss: 22.568321228027344
Copied model parameters to target network.
Step 23 (29999) @ Episode 1201/10000, loss: 17.420785903930664
Copied model parameters to target network.
Step 23 (39999) @ Episode 1601/10000, loss: 17.616834640502932
Copied model parameters to target network.
Step 23 (49999) @ Episode 2001/10000, loss: 28.768297195434572
Copied model parameters to target network.
Step 23 (59999) @ Episode 2401/10000, loss: 10.950723648071289
Copied model parameters to target network.
Step 23 (69999) @ Episode 2801/10000, loss: 11.847319602966309
Copied model parameters to target network.
Step 23 (79999) @ Episode 3201/10000, loss: 9.4709930419921887
Copied model parameters to target network.
Step 23 (89999) @ Episode 3601/10000, loss: 12.173801422119148
Copied model parameters to target network.
Step

KeyboardInterrupt: 