In [2]:
import logging
import random
from time import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [5]:
def train():
    costs = []  # this will store the costs, so we can plot them later
    r1 = []  # same, but for the players total rewards
    r2 = []
    random.seed(int(time()*1000))
    tf.reset_default_graph()
    logging.basicConfig(level=logging.WARN, format='%(message)s')

    # Initialize players
    p1 = players.QPlayer([100,160,160,100],
                         learning_batch_size=150, batches_to_q_target_switch=1000,
                         gamma=0.95, tau=0.95, memory_size=100000)
    p1.restore('./models/q.ckpt')
    p1.name = 'Q'


    p2 = players.Novice()
    p2.name = 'N'
    
    total_rewards = {p1.name: 0, p2.name: 0}

    # Start playing
    num_of_games = 400000
    for g in range(1,num_of_games+1):
        game = Game(p1,p2) if g%2==0 else Game(p2,p1)  # make sure both players play X and O
        last_phases = {p1.name: None, p2.name: None}  # will be used to store the last state a player was in
        while not game.game_status()['game_over']:
            if isinstance(game.active_player(), players.Human):
                game.print_board()
                print("{}'s turn:".format(game.active_player().name))

            # If this is not the first move, store in memory the transition from the last state
            # the active player saw to this one
            state = np.copy(game.board)
            if last_phases[game.active_player().name] is not None:
                memory_element = last_phases[game.active_player().name]
                memory_element['next_state'] = state
                memory_element['game_over'] = False
                game.active_player().add_to_memory(memory_element)

            # Calculate annealed epsilon
            if g <= num_of_games // 4:
                max_eps = 0.6
            elif g <= num_of_games // 2:
                max_eps = 0.01
            else:
                max_eps = 0.001
            min_eps = 0.01 if g <= num_of_games // 2 else 0.0
            eps = round(max(max_eps - round(g*(max_eps-min_eps)/num_of_games, 3), min_eps), 3)

            # Play and receive reward
            action = int(game.active_player().select_cell(state, epsilon=eps))
            play_status = game.play(action)
            game_over = play_status['game_over']
            if play_status['invalid_move']:
                r = game.invalid_move_reward
            elif game_over:
                if play_status['winner'] == 0:
                    r = game.tie_reward
                else:
                    r = game.winning_reward
            else:
                r = 0

            # Store the current state in temporary memory
            last_phases[game.active_player().name] = {'state': state,
                                                      'action': action,
                                                      'reward': r}
            total_rewards[game.active_player().name] += r

            # Activate learning procedure
            cost = game.active_player().learn(learning_rate=0.0001)
            if cost is not None:
                costs.append(cost)

            # Next player's turn, if game hasn't ended
            if not game_over:
                game.next_player()

        # Adding last phase for winning (active) player
        memory_element = last_phases[game.active_player().name]
        memory_element['next_state'] = np.zeros(9)
        memory_element['game_over'] = True
        game.active_player().add_to_memory(memory_element)

        # Adding last phase for losing (inactive) player
        memory_element = last_phases[game.inactive_player().name]
        memory_element['next_state'] = np.zeros(9)
        memory_element['game_over'] = True
        memory_element['reward'] = game.losing_reward
        game.inactive_player().add_to_memory(memory_element)

        # Print statistics
        if g % 100 == 0:
            print('Game: {g} | Number of Trainings: {t} | Epsilon: {e} | Average Rewards - {p1}: {r1}, {p2}: {r2}'
                  .format(g=g, p1=p1.name, r1=total_rewards[p1.name]/100.0,
                          p2=p2.name, r2=total_rewards[p2.name]/100.0,
                          t=len(costs), e=eps))
            r1.append(total_rewards[p1.name]/100.0)
            r2.append(total_rewards[p2.name]/100.0)
            total_rewards = {p1.name: 0, p2.name: 0}

    # Save trained model and shutdown Tensorflow sessions
    p1.save('./models/q.ckpt')
    for pp in [p1,p2]:
        pp.shutdown()

    # Plot graphs
    plt.scatter(range(len(costs)),costs)
    plt.show()
    plt.scatter(range(len(r1)),r1,c='g')
    plt.show()
    plt.scatter(range(len(r2)), r2, c='r')
    plt.show()



In [6]:
def play():
    random.seed(int(time()))
    
    # load the RL model
    p1 = players.QPlayer([100,160,160,100], learning_batch_size=100, gamma=0.95, tau=0.95,
                         batches_to_q_target_switch=100, memory_size=100000)
    p1.restore('./models/q.ckpt')
    
    # player 2 is human
    p2 = players.Human()
    
    for g in range(4):
        print('STARTING NEW GAME (#{})\n-------------'.format(g))
        if g%2==0:
            game = Game(p1,p2)
            print("Computer is X (1)")
        else:
            game = Game(p2,p1)
            print("Computer is O (-1)")
        while not game.game_status()['game_over']:
            if isinstance(game.active_player(), players.Human):
                game.print_board()
                print("{}'s turn:".format(game.current_player))
            state = np.copy(game.board)
            # Force Q-Network to select different starting positions if it plays first
            action = int(game.active_player().select_cell(state,epsilon=0.0)) if np.count_nonzero(game.board) > 0 or not isinstance(game.active_player(),players.QPlayer) else random.randint(0,8)
            game.play(action)
            if not game.game_status()['game_over']:
                game.next_player()
        print('-------------\nGAME OVER!')
        game.print_board()
        print(game.game_status())
        print('-------------')

#train()

In [7]:
import numpy as np


class Game:
    """
    Tic-Tac-Toe game class
    """
    board = np.zeros(9)
    current_player = 1  # first player is 1, second player is -1
    player1 = None
    player2 = None

    _invalid_move_played = False

    def __init__(self, player1, player2,
                 winning_reward=1,
                 losing_reward=-1,
                 tie_reward=0,
                 invalid_move_reward=-10):
        self.player1 = player1
        self.player2 = player2
        self.player1.player_id = 1
        self.player2.player_id = -1
        self.winning_reward = winning_reward
        self.losing_reward = losing_reward
        self.invalid_move_reward = invalid_move_reward
        self.tie_reward = tie_reward
        self.reset()

    def reset(self):
        self.board = np.zeros(9)
        self.current_player = 1
        self._invalid_move_played = False

    def active_player(self):
        if self.current_player == 1:
            return self.player1
        else:
            return self.player2

    def inactive_player(self):
        if self.current_player == -1:
            return self.player1
        else:
            return self.player2

    def play(self, cell):
        self._invalid_move_played = False
        if self.board[cell] != 0:
            self._invalid_move_played = True
            return {'winner': 0,
                    'game_over': False,
                    'invalid_move': True}
        else:
            self.board[cell] = self.current_player
        status = self.game_status()
        return {'winner': status['winner'],
                'game_over': status['game_over'],
                'invalid_move': False}

    def next_player(self):
        if not self._invalid_move_played:
            self.current_player *= -1

    def game_status(self):
        winner = 0
        winning_seq = []
        winning_options = [[0,1,2],[3,4,5],[6,7,8],
                           [0,3,6],[1,4,7],[2,5,8],
                           [0,4,8],[2,4,6]]
        for seq in winning_options:
            s = self.board[seq[0]] + self.board[seq[1]] + self.board[seq[2]]
            if abs(s) == 3:
                winner = s/3
                winning_seq = seq
                break
        game_over = winner != 0 or len(list(filter(lambda z: z==0, self.board))) == 0
        return {'game_over': game_over, 'winner': winner,
                'winning_seq': winning_seq, 'board': self.board}

    def print_board(self):
        row = ' '
        status = self.game_status()
        for i in reversed(range(9)):
            if self.board[i] == 1:
                cell = 'x'
            elif self.board[i] == -1:
                cell = 'o'
            else:
                cell = ' '
            if status['winner'] != 0 and i in status['winning_seq']:
                cell = cell.upper()
            row += cell + ' '
            if i % 3 != 0:
                row += '| '
            else:
                row = row[::-1]
                if i != 0:
                    row += ' \n-----------'
                print(row)
                row = ' '


In [8]:
import random
import tensorflow as tf
from collections import deque


class QNetwork:
    """
    A Q-Network implementation
    """
    def __init__(self, input_size, output_size, hidden_layers_size, gamma):
        self.q_target = tf.placeholder(shape=(None, output_size), dtype=tf.float32)
        self.r = tf.placeholder(shape=None, dtype=tf.float32)
        self.states = tf.placeholder(shape=(None, input_size), dtype=tf.float32)
        self.actions = tf.placeholder(shape=(None, 2), dtype=tf.int32)  # enumerated actions
        self.learning_rate = tf.placeholder(shape=[], dtype=tf.float32)
        layer = self.states
        for l in hidden_layers_size:
            layer = tf.layers.dense(inputs=layer, units=l, activation=tf.nn.relu,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.output = tf.layers.dense(inputs=layer, units=output_size,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.predictions = tf.gather_nd(self.output, indices=self.actions)
        self.labels = self.r + (gamma * tf.reduce_max(self.q_target, axis=1))
        self.cost = tf.reduce_mean(tf.losses.mean_squared_error(labels=self.labels, predictions=self.predictions))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)


class ReplayMemory:
    """
    A cyclic Experience Replay memory buffer
    """
    memory = None
    counter = 0

    def __init__(self, size, seed=None):
        self.memory = deque(maxlen=size)
        if seed is not None:
            random.seed(seed)

    def __len__(self):
        return len(self.memory)

    def append(self, element):
        self.memory.append(element)
        self.counter += 1

    def sample(self, n, or_less=False):
        if or_less and n > self.counter:
            n = self.counter
        return random.sample(self.memory, n)


In [13]:
import random
import numpy as np
import tensorflow as tf
from abc import abstractmethod


class Player:
    """
    Base class for all player types
    """
    name = None
    player_id = None

    def __init__(self):
        pass

    def shutdown(self):
        pass

    def add_to_memory(self, add_this):
        pass

    @abstractmethod
    def select_cell(self, board, **kwargs):
        pass

    @abstractmethod
    def learn(self, **kwargs):
        pass


class Human(Player):
    """
    This player type allow a human player to play the game
    """
    def select_cell(self, board, **kwargs):
        cell = input("Select cell to fill:\n678\n345\n012\ncell number: ")
        return cell

    def learn(self, **kwargs):
        pass


class Drunk(Player):
    """
    Drunk player always selects a random valid move
    """
    def select_cell(self, board, **kwargs):
        available_cells = np.where(board == 0)[0]
        return random.choice(available_cells)

    def learn(self, **kwargs):
        pass


class Novice(Player):
    """
    A more sophisticated bot, which follows the following strategy:
    1) If it already has 2-in-a-row, capture the required cell for 3
    2) If not, and if the opponent has 2-in-a-row, capture the required cell to prevent hi, from winning
    3) Else, select a random vacant cell
    """
    def find_two_of_three(self, board, which_player_id):
        cell = None
        winning_options = [[0, 1, 2], [3, 4, 5], [6, 7, 8],
                           [0, 3, 6], [1, 4, 7], [2, 5, 8],
                           [0, 4, 8], [2, 4, 6]]
        random.shuffle(winning_options)
        for seq in winning_options:
            s = board[seq[0]] + board[seq[1]] + board[seq[2]]
            if s == 2 * which_player_id:
                a = np.array([board[seq[0]], board[seq[1]], board[seq[2]]])
                c = np.where(a == 0)[0][0]
                cell = seq[c]
                break
        return cell

    def select_cell(self, board, **kwargs):
        cell = self.find_two_of_three(board,self.player_id)
        if cell is None:
            cell = self.find_two_of_three(board,-self.player_id)
        if cell is None:
            available_cells = np.where(board == 0)[0]
            cell = random.choice(available_cells)
        return cell

    def learn(self, **kwargs):
        pass


class lazy_tensai(Player):
    """
    A reinforcement learning agent, based on Double Deep Q Network model
    This class holds two Q-Networks: `qnn` is the learning network, `q_target` is the semi-constant network
    """
    def __init__(self, hidden_layers_size, gamma):
        """
        :param hidden_layers_size: an array of integers, specifying the number of layers of the network and their size
        :param gamma: the Q-Learning discount factor
        :param learning_batch_size: training batch size
        :param batches_to_q_target_switch: after how many batches (trainings) should the Q-network be copied to Q-Target
        :param tau: a number between 0 and 1, determining how to combine the network and Q-Target when copying is performed
        :param memory_size: size of the memory buffer used to keep the training set
        """

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)  
        self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        self.qnn = dqn.QNetwork(9, 9, hidden_layers_size, gamma)
        self.session.run(tf.global_variables_initializer())
        super(lazy_tensai, self).__init__()

    def select_cell(self, board, **kwargs):
        rnd = random.random()
        eps = kwargs['epsilon']
        if rnd < eps:
            cell = random.randint(0,8)
            logging.debug("Choosing a random cell: %s [Epsilon = %s]", cell, eps)
        else:
            prediction = self.session.run(self.qnn.output,feed_dict={self.qnn.states: np.expand_dims(self.player_id * board, axis=0)})
            prediction = np.squeeze(prediction)
            cell = np.argmax(prediction)
            logging.debug("Predicting next cell - board: %s | player ID: %s | prediction: %s | cell: %s [Epsilon = %s]", board, prediction, cell, eps)
        return cell

    def restore(self, filename):
        saver = tf.train.Saver()
        saver.restore(self.session, filename)
        
    def learn(self, **kwargs):
        pass


class QPlayer(Player):
    """
    A reinforcement learning agent, based on Double Deep Q Network model
    This class holds two Q-Networks: `qnn` is the learning network, `q_target` is the semi-constant network
    """
    def __init__(self, hidden_layers_size, gamma, learning_batch_size,
                 batches_to_q_target_switch, tau, memory_size):
        """
        :param hidden_layers_size: an array of integers, specifying the number of layers of the network and their size
        :param gamma: the Q-Learning discount factor
        :param learning_batch_size: training batch size
        :param batches_to_q_target_switch: after how many batches (trainings) should the Q-network be copied to Q-Target
        :param tau: a number between 0 and 1, determining how to combine the network and Q-Target when copying is performed
        :param memory_size: size of the memory buffer used to keep the training set
        """
        self.learning_batch_size = learning_batch_size
        self.batches_to_q_target_switch = batches_to_q_target_switch
        self.tau = tau
        self.learn_counter = 0
        self.counter = 0

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)  
        self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        self.memory = dqn.ReplayMemory(memory_size)
        self.qnn = dqn.QNetwork(9, 9, hidden_layers_size, gamma)
        self.q_target = dqn.QNetwork(9, 9, hidden_layers_size, gamma)
        self.session.run(tf.global_variables_initializer())
        super(QPlayer, self).__init__()

    def select_cell(self, board, **kwargs):
        rnd = random.random()
        eps = kwargs['epsilon']
        self.counter += 1
        if rnd < eps:
            cell = random.randint(0,8)
            logging.debug("Choosing a random cell: %s [Epsilon = %s]", cell, eps)
        else:
            prediction = self.session.run(self.qnn.output,feed_dict={self.qnn.states: np.expand_dims(self.player_id * board, axis=0)})
            prediction = np.squeeze(prediction)
            cell = np.argmax(prediction)
            logging.debug("Predicting next cell - board: %s | player ID: %s | prediction: %s | cell: %s [Epsilon = %s]", board, prediction, cell, eps)
        return cell

    @staticmethod
    def _fetch_from_batch(batch, key, enum=False):
        if enum:
            return np.array(list(enumerate(map(lambda x: x[key], batch))))
        else:
            return np.array(list(map(lambda x: x[key], batch)))

    def learn(self, **kwargs):
        logging.debug('Memory counter = %s', self.memory.counter)
        self.learn_counter += 1
        if self.learn_counter % self.learning_batch_size != 0 or self.memory.counter < self.learning_batch_size:
            pass
        else:
            logging.debug('Starting learning procedure')
            batch = self.memory.sample(self.learning_batch_size)
            qt = self.session.run(self.q_target.output,feed_dict={self.q_target.states: self._fetch_from_batch(batch,'next_state')})
            terminals = self._fetch_from_batch(batch,'game_over')
            for i in range(terminals.size):
                if terminals[i]:
                    qt[i] = np.zeros(9)  # manually setting q-target values of terminal states to 0
            lr = kwargs['learning_rate']
            _, cost = self.session.run([self.qnn.optimizer, self.qnn.cost],
                                       feed_dict={self.qnn.states: self._fetch_from_batch(batch,'state'),
                                                  self.qnn.r: self._fetch_from_batch(batch,'reward'),
                                                  self.qnn.actions: self._fetch_from_batch(batch, 'action', enum=True),
                                                  self.qnn.q_target: qt,
                                                  self.qnn.learning_rate: lr})
            logging.info('Batch number: %s | Q-Network cost: %s | Learning rate: %s',
                         self.learn_counter % self.learning_batch_size, cost, lr)
            if self.memory.counter % (self.batches_to_q_target_switch * self.learning_batch_size) == 0:
                logging.info('Copying Q-Network to Q-Target')
                tf_vars = tf.trainable_variables()
                num_of_vars = len(tf_vars)
                operations = []
                for i,v in enumerate(tf_vars[0:num_of_vars//2]):
                    operations.append(tf_vars[i+num_of_vars//2].assign((v.value()*self.tau) + ((1-self.tau)*tf_vars[i+num_of_vars//2].value())))
                self.session.run(operations)
            return cost

    def add_to_memory(self, add_this):
        add_this['state'] = self.player_id * add_this['state']
        add_this['next_state'] = self.player_id * add_this['next_state']
        self.memory.append(add_this)

    def save(self, filename):
        saver = tf.train.Saver()
        saver.save(self.session, filename)

    def restore(self, filename):
        saver = tf.train.Saver()
        saver.restore(self.session, filename)

    def shutdown(self):
        self.session.close()



In [14]:
train()

NameError: name 'players' is not defined