In [7]:
import numpy as np


In [25]:
import logging
import random
import numpy as np
import tensorflow as tf
from abc import abstractmethod


class Player():
    """
    Abstract player class
    """
    def __init__(self):
        self.player_id = None
        self.observation = None
        
    def reset(self, player_id):
        '''reset to the initial state
        '''
        self.player_id = player_id
        self.observation = None
        
    def observe(self, observation):
        '''
        receive raw observation from env and tune it if needed
        '''
        self.observation = tune_observation_view(observation, self.player_id)
        
    @staticmethod
    def tune_observation_view(observation, player_id):
        '''
        player_id either 1 or -1. Swap the observation such that 1 means self and -1 means the opponent 
        e.g. 
        if player_id = 1, no need to swap the view,
        input = array([ 1, -1, 0,
                        0,  0, 0,
                       -1, -1, 1 ])

        output = array([ 1, -1, 0,
                         0,  0, 0,
                        -1, -1, 1 ])

        if player_id = -1, need to swap the view,
        input = array([ 1, -1, 0,
                        0,  0, 0,
                       -1, -1, 1 ])

        output = array([-1, 1, 0,
                         0, 0, 0,
                         1, 1, -1 ])

        '''
        return observation * player_id

    def pick_action(self, observation):
        '''different players have different way to pick an action
        '''
        pass

    def memorize(self, add_this):
        '''some players will jot notes, some will not
        '''
        pass

    def learn(self, board, **kwargs):
        '''some players will study, some will not
        '''
        pass

    
class Human(Player):
    '''
    choose this player if you want to play the game
    '''
    def __init__(self):
        super(Human, self).__init__()
        
    def pick_action(self, board, **kwargs):
        cell = input('Pick a cell (top left is 0 and bottom right is 8): ')
        return cell

    
class Random_player(Player):
    """
    this player will pick random acion for all situation
    """
    def __init__(self):
        super(Random_player, self).__init__()
        
    def pick_action(self, is_action_available):
        possible_action_list = np.argwhere(is_action_available == 1).reshape([-1])
        return np.random.choice(possible_action_list, 1)[0]
    

class QPlayer(Player):
    """
    A reinforcement learning agent, based on Double Deep Q Network model
    This class holds two Q-Networks: `qnn` is the learning network, `q_target` is the semi-constant network
    """
    def __init__(self, hidden_layers_size, gamma, learning_batch_size,
                 batches_to_q_target_switch, tau, memory_size):
        """
        :param hidden_layers_size: an array of integers, specifying the number of layers of the network and their size
        :param gamma: the Q-Learning discount factor
        :param learning_batch_size: training batch size
        :param batches_to_q_target_switch: after how many batches (trainings) should the Q-network be copied to Q-Target
        :param tau: a number between 0 and 1, determining how to combine the network and Q-Target when copying is performed
        :param memory_size: size of the memory buffer used to keep the training set
        """
        self.learning_batch_size = learning_batch_size
        self.batches_to_q_target_switch = batches_to_q_target_switch
        self.tau = tau
        self.learn_counter = 0
        self.counter = 0

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)  
        self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        self.memory = dqn.ReplayMemory(memory_size)
        self.qnn = dqn.QNetwork(9, 9, hidden_layers_size, gamma)
        self.q_target = dqn.QNetwork(9, 9, hidden_layers_size, gamma)
        self.session.run(tf.global_variables_initializer())
        super(QPlayer, self).__init__()

    def select_cell(self, board, **kwargs):
        rnd = random.random()
        eps = kwargs['epsilon']
        self.counter += 1
        if rnd < eps:
            cell = random.randint(0,8)
            logging.debug("Choosing a random cell: %s [Epsilon = %s]", cell, eps)
        else:
            prediction = self.session.run(self.qnn.output,feed_dict={self.qnn.states: np.expand_dims(self.player_id * board, axis=0)})
            prediction = np.squeeze(prediction)
            cell = np.argmax(prediction)
            logging.debug("Predicting next cell - board: %s | player ID: %s | prediction: %s | cell: %s [Epsilon = %s]", board, prediction, cell, eps)
        return cell

    @staticmethod
    def _fetch_from_batch(batch, key, enum=False):
        if enum:
            return np.array(list(enumerate(map(lambda x: x[key], batch))))
        else:
            return np.array(list(map(lambda x: x[key], batch)))

    def learn(self, **kwargs):
        logging.debug('Memory counter = %s', self.memory.counter)
        self.learn_counter += 1
        if self.learn_counter % self.learning_batch_size != 0 or self.memory.counter < self.learning_batch_size:
            pass
        else:
            logging.debug('Starting learning procedure')
            batch = self.memory.sample(self.learning_batch_size)
            qt = self.session.run(self.q_target.output,feed_dict={self.q_target.states: self._fetch_from_batch(batch,'next_state')})
            terminals = self._fetch_from_batch(batch,'game_over')
            for i in range(terminals.size):
                if terminals[i]:
                    qt[i] = np.zeros(9)  # manually setting q-target values of terminal states to 0
            lr = kwargs['learning_rate']
            _, cost = self.session.run([self.qnn.optimizer, self.qnn.cost],
                                       feed_dict={self.qnn.states: self._fetch_from_batch(batch,'state'),
                                                  self.qnn.r: self._fetch_from_batch(batch,'reward'),
                                                  self.qnn.actions: self._fetch_from_batch(batch, 'action', enum=True),
                                                  self.qnn.q_target: qt,
                                                  self.qnn.learning_rate: lr})
            logging.info('Batch number: %s | Q-Network cost: %s | Learning rate: %s',
                         self.learn_counter % self.learning_batch_size, cost, lr)
            if self.memory.counter % (self.batches_to_q_target_switch * self.learning_batch_size) == 0:
                logging.info('Copying Q-Network to Q-Target')
                tf_vars = tf.trainable_variables()
                num_of_vars = len(tf_vars)
                operations = []
                for i,v in enumerate(tf_vars[0:num_of_vars//2]):
                    operations.append(tf_vars[i+num_of_vars//2].assign((v.value()*self.tau) + ((1-self.tau)*tf_vars[i+num_of_vars//2].value())))
                self.session.run(operations)
            return cost

    def add_to_memory(self, add_this):
        add_this['state'] = self.player_id * add_this['state']
        add_this['next_state'] = self.player_id * add_this['next_state']
        self.memory.append(add_this)

    def save(self, filename):
        saver = tf.train.Saver()
        saver.save(self.session, filename)

    def restore(self, filename):
        saver = tf.train.Saver()
        saver.restore(self.session, filename)

    def shutdown(self):
        self.session.close()



In [None]:
# work for random player and human first

class Player():
    """
    Abstract player class
    """
    def __init__(self):
        self.player_id = None
        self.observation = None
        
    def reset(self, player_id):
        '''reset to the initial state
        '''
        self.player_id = player_id
        self.observation = None
        
    def observe(self, observation):
        '''
        receive raw observation from env and tune it if needed
        '''
        self.observation = tune_observation_view(observation, self.player_id)
        
    @staticmethod
    def tune_observation_view(observation, player_id):
        '''
        player_id either 1 or -1. Swap the observation such that 1 means self and -1 means the opponent 
        e.g. 
        if player_id = 1, no need to swap the view,
        input = array([ 1, -1, 0,
                        0,  0, 0,
                       -1, -1, 1 ])

        output = array([ 1, -1, 0,
                         0,  0, 0,
                        -1, -1, 1 ])

        if player_id = -1, need to swap the view,
        input = array([ 1, -1, 0,
                        0,  0, 0,
                       -1, -1, 1 ])

        output = array([-1, 1, 0,
                         0, 0, 0,
                         1, 1, -1 ])

        '''
        return observation * player_id

    def pick_action(self, observation):
        '''different players have different way to pick an action
        '''
        pass

    def memorize(self, add_this):
        '''some players will jot notes, some will not
        '''
        pass

    def learn(self, board, **kwargs):
        '''some players will study, some will not
        '''
        pass

    
class Human(Player):
    '''
    choose this player if you want to play the game
    '''
    def __init__(self):
        super(Human, self).__init__()
        
    def pick_action(self, board, **kwargs):
        cell = input('Pick a cell (top left is 0 and bottom right is 8): ')
        return cell

    
class Random_player(Player):
    """
    this player will pick random acion for all situation
    """
    def __init__(self):
        super(Random_player, self).__init__()
        
    def pick_action(self, is_action_available):
        possible_action_list = np.argwhere(is_action_available == 1).reshape([-1])
        return np.random.choice(possible_action_list, 1)[0]
    


In [33]:
random_p1 = Random_player()
random_p2 = Random_player()

random_p1.pick_action(np.array([1,1,1,1,1,1,1,1,1]))

6

In [4]:
def process_memory(memory, is_terminal_state):
    if (len(memory) == memory_size + 1) and (is_terminal_state):
        # special handle
        pass
    
    # do all the shifting 
    for each record:
        if player is 2:
            swap observation, next_observation, reward
        
    pass

SyntaxError: invalid syntax (<ipython-input-4-ab283bd9a1f2>, line 7)

In [5]:
memory = np.array([])

next_observation, player_turn, reward, is_terminal_state = env.step(action)
memory.append([original_observation, action, reward, next_observation, player])

if (len(memory) == memory_size + 1) or (is_terminal_state):
    # shift reward, state, done, and assign 1 for my round and 2 for the opponent round
    memory_for_training = process_memory(memory, is_terminal_state)
    if is_terminal_state:
        memory = np.array([])
    else:
        memory = memory[-1]
        
    agent.learn(memory_for_training)

NameError: name 'env' is not defined

In [None]:
# main flow

load_weight_path = 'dummy_path'
model_weight_path = 'dummy_path'

env = make_env()
bot1 = Bot(load_weight_path)
bot2 = Bot()
bot_list = [bot1, bot2]
unprocess_memory = np.array([])

num_episode = 10


for episode in range(num_episode):
    env.reset()
    
    # get initial state
    observation, player_turn, reward, is_terminal_state = env.get_current_info()
    
    while not is_terminal_state:
        # if now is player 2's turn, swap the observation
        if player_turn == 2:
            swapped_observation = swap_observation_view(observation)
            action = bot_list[player_turn-1].select_action(swapped_observation)
        elif player_turn == 1:
            action = bot_list[player_turn-1].select_action(observation)
        else:
            print('Error in player turn. Current player turn is %s.\n' % player_turn)
        
        next_observation, next_player_turn, reward, is_terminal_state = env.step(action)
        
        unprocess_memory.append([observation, action, reward, next_observation, player_turn])
        
        if (len(unprocess_memory) == memory_size + 1) or (is_terminal_state):
            # shift reward, state, done, and assign 1 for my round and 2 for the opponent round
            memory_for_training = process_memory(unprocess_memory, is_terminal_state)
            if is_terminal_state:
                unprocess_memory = np.array([])
            else:
                unprocess_memory = unprocess_memory[-1]

            bot1.learn(memory_for_training)
            
        observation, player_turn = next_observation, next_player_turn
        
    # save the model weight periodically
    if (episode + 1) % 50 == 0:
        bot1.save_weight(model_weight_path)

In [None]:
class Bot():
    def __init__(self, load_weight_path=None):
        self.observation['board'] = np.zeros([9, 3])
        self.observation['1p_inventory'] = np.array([2, 2, 2])
        self.observation['2p_inventory'] = np.array([2, 2, 2])
                
        self.model = self.build_model()
        if load_weight_path is not None:
            self.model = self.load_weight()
        
        self
        pass
    
    def build_model(self):
        pass
    
    def load_model(self):
        pass
    
    def choose_action(self, observation):
        pass

In [None]:
bot1 = Bot()
bot2 = Bot()

player_list = [bot1, bot2]
save_list = []

for episode in range(100):
    # reset environment
    env.reset()
    
    # start from the 1st player
    player_turn = 1
    observation = env.observation
    while game not end:
        action = player_list[state['player_turn']].pick_action(observation)
        
        # if the action is not valid, the program will be break      
        state, next_observation, reward = env.step(action)
        
        save_list.append([observation, action, next_observation, reward])
        observation = next_observation
        

        
        

In [10]:
np.random.choice(b, 1)

ValueError: 'a' must be 1-dimensional