In [4]:
from BaseAgent import BaseAgent
from BaseEnvironment import BaseEnvironment
from RLGlue import RLGlue
from Softmax import softmax
from Adam import Adam
from SimpleNN import SimpleNN

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

In [5]:
class TicTacToeEvnironment(BaseEnvironment):
    """Two-player tic-tac-toe board behind the BaseEnvironment interface.

    The board is a 3x3 float array. 0 marks an empty cell; a player claims a
    cell by writing its own ``agent_num`` into it (the notebook uses 1 and -1).
    Observations are returned as a flattened ``(1, 9)`` *copy* of the board,
    paired with a length-9 mask holding 1 for empty cells and 0 for taken ones.
    """

    def env_init(self, env_info=None):
        """Accept (and ignore) optional configuration; nothing to set up.

        ``None`` replaces the former mutable ``{}`` default.
        """
        pass

    def env_start(self):
        """Reset to an empty board.

        Returns:
            (observation, mask): the flattened empty board and the all-ones
            mask of available cells.
        """
        self.terminal = False
        self.board = np.zeros((3, 3))
        # Kept as in the original: this tuple holds the live 3x3 board object.
        self.reward_obs_term = (0, self.board, False)
        return self._observation(), self.get_mask()

    def env_step(self, agent_num, index):
        """Place ``agent_num`` at flat cell ``index`` and score the move.

        Rewards are from the moving player's point of view: 10 for completing
        a line, 0 for filling the board without a winner, -1 otherwise. The win
        check runs first, so a winning move on the last empty cell is a win.

        Returns:
            (reward, observation, terminal, mask)

        Note: stepping a terminal environment only prints a warning and then
        proceeds, and the target cell is not checked for occupancy.
        """
        if self.terminal:
            print("Environment in terminal state, please restart.")

        row, col = self.transform_index(index)
        self.board[row, col] = agent_num

        if self.check_won(agent_num):
            reward = 10
            self.terminal = True
        elif self.check_tie():
            reward = 0
            self.terminal = True
        else:
            reward = -1

        self.reward_obs_term_mask = (reward, self._observation(), self.terminal, self.get_mask())
        return self.reward_obs_term_mask

    def _observation(self):
        """Return the board flattened to shape (1, 9) as an independent copy.

        ``reshape`` alone yields a view of ``self.board``; without the copy,
        any state a caller keeps would be rewritten by every later move.
        """
        return self.board.reshape(1, -1).copy()

    def check_tie(self):
        """True when no empty (zero) cell remains on the board."""
        return (self.board == 0).sum() == 0

    def check_won(self, agent_num):
        """True when any row, column or diagonal is filled with ``agent_num``."""
        target = agent_num * np.ones((3,))
        lines = list(self.board) + list(self.board.T)
        lines.append(np.diag(self.board))
        lines.append(np.diag(np.fliplr(self.board)))
        return any(np.array_equal(line, target) for line in lines)

    def env_cleanup(self):
        """Nothing to release."""
        pass

    def env_message(self, message):
        """Answer message ``0`` with the available-cell mask; otherwise return None."""
        if message == 0:  # return available indices mask
            return self.get_mask()

    def get_mask(self):
        """Return a length-9 float array: 1 for empty cells, 0 for taken ones.

        Cells are ordered row-major, i.e. flat index = row * 3 + col.
        """
        return (self.board.reshape(-1) == 0).astype(float)

    def transform_index(self, index):
        """Convert a flat cell index (0-8) into a (row, col) pair."""
        return index // 3, index % 3


# Correctly spelled alias; the original (misspelled) name stays for existing callers.
TicTacToeEnvironment = TicTacToeEvnironment
    
    
class TicTacToeAgent(BaseAgent):
    """SARSA agent that samples moves from a softmax policy over legal cells."""

    def agent_init(self, agent_init_info):
        """Read hyper-parameters and collaborators from ``agent_init_info``.

        Required keys: "discount", "network", "optimizer", "tau",
        "num_actions" and "seed" (used to seed the agent's private RNG).
        """
        self.discount = agent_init_info["discount"]
        self.network = agent_init_info["network"]
        self.optimizer = agent_init_info["optimizer"]
        self.tau = agent_init_info["tau"]
        self.num_actions = agent_init_info["num_actions"]

        self.rand_generator = np.random.RandomState(agent_init_info["seed"])

        self.last_state = None
        self.last_action = None

    def policy(self, state, mask):
        """Sample an action from the softmax of the action values, limited to ``mask``.

        Args:
            state: observation passed to ``network.get_action_values``.
            mask: length-``num_actions`` array, 1 for legal actions, 0 otherwise.

        Returns:
            The index of the sampled action.

        Raises:
            ValueError: if ``mask`` allows no action at all.
        """
        action_values = self.network.get_action_values(state)
        probs = softmax(action_values, self.tau) * mask
        total = probs.sum()
        if total == 0:
            # Every legal action received zero probability (e.g. underflow at a
            # small tau): fall back to a uniform choice among the legal moves
            # instead of dividing by zero.
            legal = np.asarray(mask, dtype=float)
            if legal.sum() == 0:
                raise ValueError("policy called with no legal actions available")
            probs = legal / legal.sum()
        else:
            probs = probs / total
        action = self.rand_generator.choice(self.num_actions, p=probs.squeeze())
        return action

    def agent_start(self, state, mask):
        """Choose the first action of an episode; performs no learning update."""
        # Copy so the remembered state cannot be altered if the caller's array
        # is a view that keeps changing (e.g. a reshaped live board).
        self.last_state = np.copy(state)
        self.last_action = self.policy(self.last_state, mask)
        return self.last_action

    def agent_step(self, reward, state, mask):
        """Pick the next action and apply the SARSA update for the previous one.

        The TD error is
        ``reward + discount * Q(state, action) - Q(last_state, last_action)``.
        """
        # SARSA
        action = self.policy(state, mask)
        next_value = self.network.get_action_values(state)[action]
        last_value = self.network.get_action_values(self.last_state)[self.last_action]
        delta = reward + self.discount * next_value - last_value
        self._td_update(delta)

        self.last_state = np.copy(state)
        self.last_action = action
        return action

    def agent_end(self, reward):
        """Apply the terminal SARSA update (there is no successor value)."""
        # SARSA
        delta = reward - self.network.get_action_values(self.last_state)[self.last_action]
        self._td_update(delta)

    def _td_update(self, delta):
        """Push TD error ``delta`` for (last_state, last_action) through the optimizer."""
        delta_mat = np.zeros((1, self.num_actions))
        delta_mat[0, self.last_action] = delta

        grads = self.network.get_gradients(self.last_state, delta_mat)
        # NOTE(review): the return value is discarded, as in the original; this
        # assumes update_weights mutates the object returned by get_weights()
        # in place - confirm against Adam / SimpleNN.
        self.optimizer.update_weights(self.network.get_weights(), grads)

    def agent_message(self, message):
        """No messages are supported."""
        pass

In [44]:
# Load the pickled network weights from disk.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("agent-1-network-weights", "rb") as fr:  # closed even if loading fails
    weights = pickle.load(fr)

In [45]:
# Layer sizes handed to both SimpleNN and Adam
# (presumably input / hidden / output widths - confirm against SimpleNN).
layers1 = [9, 6, 9]

# Build the network, then overwrite its initial weights with the loaded ones.
nn = SimpleNN({"layer_sizes": layers1, "seed": 11})
nn.set_weights(weights)

# Adam settings used for the SARSA updates.
optimizer_info = {
    "step_size": 0.1,
    "beta_m": 0.99,
    "beta_v": 0.999,
    "epsilon": 0.0001,
}
optimizer = Adam(layers1, optimizer_info)

In [46]:
# A small tau sharpens the softmax, so the agent mostly picks its
# highest-valued legal action.
agent_info = {
    "discount": 1,
    "network": nn,
    "optimizer": optimizer,
    "tau": 0.1,
    "num_actions": 9,
    "seed": 12,
}

In [47]:
# Create the AI player and configure it from agent_info
# (network, optimizer, tau, discount, seed).
ai = TicTacToeAgent()
ai.agent_init(agent_info)

In [48]:
# Create the board environment; env_init takes no required configuration.
env = TicTacToeEvnironment()
env.env_init()

In [49]:
# Reset to an empty board; `mask` flags the cells that are still free.
state, mask = env.env_start()

In [50]:
# Human (player 1) opens in the centre: flat index 4 -> row 1, col 1.
reward, state, ter, mask = env.env_step(1, 4)

In [51]:
# The AI samples its first move from the masked softmax policy.
action = ai.agent_start(state, mask)
# Bare expression: display the chosen cell index.
action

5

In [52]:
# AI (player -1) plays the cell it just chose.
reward, state, ter, mask = env.env_step(-1, action)
# Human (player 1) takes the top-left corner: flat index 0.
reward, state, ter, mask = env.env_step(1, 0)

In [55]:
# Display the flattened board returned by the last env_step.
state

array([[ 1.,  0.,  0.,  0.,  1., -1.,  0.,  0.,  0.]])

In [54]:
# Display the network's action values for the current board (before masking).
ai.network.get_action_values(state)

array([-4.47968785, -6.5683146 , -3.30616794,  0.10791578,  0.51350736,
        0.38507136, -1.97239166, -5.88189067, -4.42064227])

In [53]:
# NOTE(review): agent_start is reused for every AI turn in this game. It only
# refreshes last_state/last_action, so no SARSA update is applied between
# moves (agent_step is the method that learns) - confirm this is intended.
action = ai.agent_start(state, mask)
action

3

In [30]:
# AI (player -1) plays the cell it just chose.
reward, state, ter, mask = env.env_step(-1, action)
# Human (player 1) plays flat index 3 (row 1, col 0).
# NOTE(review): the recorded AI action above is also 3. If that is what was
# played here, this move overwrites the AI's mark, because env_step does not
# check that the target cell is empty. Cell execution counts are out of
# order, so verify on a fresh run.
reward, state, ter, mask = env.env_step(1, 3)

In [31]:
# The AI samples its next move from the cells still marked free in `mask`.
action = ai.agent_start(state, mask)
action

6

In [33]:
# AI (player -1) plays the cell it just chose.
reward, state, ter, mask = env.env_step(-1, action)
# Human (player 1) takes the top-right corner: flat index 2.
reward, state, ter, mask = env.env_step(1, 2)

In [36]:
# The AI samples its next move from the cells still marked free in `mask`.
action = ai.agent_start(state, mask)
action

8

In [39]:
# AI (player -1) plays the cell it just chose.
reward, state, ter, mask = env.env_step(-1, action)
# Human (player 1) plays flat index 1 (top middle), completing the top row
# (cells 0, 1, 2); env_step then returns reward 10 and ter=True.
reward, state, ter, mask = env.env_step(1, 1)

In [42]:
# human won!

In [43]:
# `reward` is the human's reward for the winning move; negate it so the AI
# receives the matching penalty in its terminal SARSA update.
ai.agent_end(-reward)