In [41]:
import numpy as np
import random
import time
from collections import deque

## Environment

In [42]:
class TicTacToeEnv:
    """An environment for two-player tic-tac-toe.

    The board is a 3x3 integer array in which 0 is an empty cell, +1 a mark of
    player 0 and -1 a mark of player 1. Player 0 moves first.

    Attributes
    ----------
    players : int
        Number of players (always 2).
    board : numpy.ndarray
        The 3x3 board.
    turn : int
        Index (0 or 1) of the player who moves next.
    done : bool
        Whether the game has ended (win or full board).
    actions : list of tuple
        The ``(row, col)`` cells that may still be played; empty once done.

    """

    def __init__(self):
        self.players = 2
        self.reset()

    def reset(self):
        """Initialize a new game."""
        # The builtin ``int`` is used because the ``np.int`` alias no longer
        # exists in current NumPy releases.
        self.board = np.zeros((3, 3), dtype=int)
        self.turn = 0
        self.done = False
        self.actions = [(a, b) for a in range(3) for b in range(3)]

    def step(self, action):
        """Perform action and return new state, rewards, done, and turn.

        Parameters
        ----------
        action : tuple
            The ``(row, col)`` cell to mark for the player whose turn it is.
            The cell must be empty.

        Returns
        -------
        board : numpy.ndarray
            A copy of the board after the move.
        rewards : numpy.ndarray
            Per-player rewards: ``[1, -1]`` if player 0 just won, ``[-1, 1]``
            if player 1 just won, ``[0, 0]`` otherwise.
        done : bool
            Whether the game is over.
        turn : int
            Index of the player who moves next.

        """
        assert self.board[action] == 0
        # Player 0 writes +1, player 1 writes -1.
        self.board[action] = (-1) ** self.turn
        self.turn = (self.turn + 1) % 2
        winner = self.winner(action)
        if winner is not None:
            rewards = np.array([winner, (-1) * winner])
        else:
            rewards = np.array([0, 0])
        # Stored as a plain bool rather than a NumPy scalar.
        self.done = bool(winner is not None or np.all(self.board != 0))
        if self.done:
            self.actions = []
        else:
            self.actions = [(a, b) for a in range(3) for b in range(3) if self.board[a, b] == 0]
        return self.board.copy(), rewards, self.done, self.turn

    def copy(self):
        """Return an independent copy of this environment."""
        # Bypass __init__/reset: every field is overwritten below, so building
        # a fresh empty game first would be wasted work.
        copy = TicTacToeEnv.__new__(TicTacToeEnv)
        copy.players = self.players
        copy.board = self.board.copy()
        copy.turn = self.turn
        copy.done = self.done
        copy.actions = self.actions.copy()
        return copy

    def render(self):
        """Print the board."""
        print(self.board)

    def winner(self, action):
        """Return the mark (+1 or -1) that completed a line through ``action``.

        Only the row, the column and, when the cell lies on them, the two
        diagonals through ``action`` are inspected. Returns ``None`` when none
        of those lines is complete.
        """
        (r, c) = (self.rsum(action[0]), self.csum(action[1]))
        if r == 3 or r == -3:
            return np.sign(r)
        if c == 3 or c == -3:
            return np.sign(c)
        if action[0] == action[1]:
            d = self.dsum()
            if d == 3 or d == -3:
                return np.sign(d)
        # Cells on the anti-diagonal are exactly those with row + col == 2.
        if sum(action) == 2:
            a = self.adsum()
            if a == 3 or a == -3:
                return np.sign(a)
        return None

    def rsum(self, r=0):
        """Return the sum of the marks in row ``r``."""
        return np.sum(self.board[r, :])

    def csum(self, c=0):
        """Return the sum of the marks in column ``c``."""
        return np.sum(self.board[:, c])

    def dsum(self):
        """Return the sum of the marks on the main diagonal."""
        return np.sum(self.board[np.arange(3), np.arange(3)])

    def adsum(self):
        """Return the sum of the marks on the anti-diagonal."""
        return np.sum(self.board[np.arange(3), np.arange(2, -1, -1)])

    def __eq__(self, other):
        """Two environments are equal when their boards are identical."""
        if not isinstance(other, TicTacToeEnv):
            # Let Python fall back to its default comparison instead of
            # failing on a missing ``board`` attribute.
            return NotImplemented
        return np.array_equal(self.board, other.board)

## Agent

In [43]:
def epsilon_greedy(epsilon=0.05):
    """Build a tree policy that explores with probability ``epsilon``.

    The returned function takes a node and picks one of ``node.children``:
    a uniformly random child when exploring, otherwise the child whose value
    is highest for the player to move at ``node``.
    """
    def policy(node):
        explore = random.random() < epsilon
        if explore:
            return random.choice(node.children)
        # Greedy branch: rank children from the mover's point of view.
        return max(node.children, key=lambda child: child.value[node.env.turn])
    return policy


def ucb(c=np.sqrt(2)):
    """Build an upper-confidence-bound tree policy.

    The returned function takes a node and picks the child with the highest
    score. An unvisited child scores infinity; otherwise the score is the
    child's value for the player to move at ``node`` plus an exploration
    bonus scaled by ``c``.
    """
    def policy(node):
        def score(child):
            # Unvisited children always win the comparison.
            if child.visits == 0:
                return np.inf
            bonus = c * np.sqrt(np.log(node.visits) / child.visits)
            return child.value[node.env.turn] + bonus
        return max(node.children, key=score)
    return policy


class TreeNode:
    """One node of the Monte Carlo search tree.

    A node records the action and reward that led to it, the environment
    state it represents, its links in the tree, and running statistics
    (visit count and one value entry per player).
    """

    def __init__(self, parent, action, reward, env):
        # How this node was reached.
        self.action, self.reward = action, reward
        # The state this node stands for.
        self.env = env
        # Tree links; children start empty.
        self.parent, self.children = parent, []
        # Search statistics.
        self.visits = 0
        self.value = np.zeros(env.players)


class MCTSAgent:
    """A Monte Carlo tree search agent.

    The search tree is kept between calls to ``act`` so that statistics
    gathered for a position can be reused when that position is reached.

    Parameters
    ----------
    tree_policy : function
        A function which maps node to child node.
    timeout : float, optional
        The amount of time in seconds to perform rollouts before choosing an action.

    """

    def __init__(self, tree_policy=ucb(), timeout=1.0):
        self.tree_policy = tree_policy
        self.timeout = timeout
        self.root = None

    def act(self, env):
        """Return a chosen action for the env.

        Parameters
        ----------
        env : environment
            The current environment.

        Returns
        -------
        action
            The action of the most visited child of the root.

        Raises
        ------
        ValueError
            If ``env`` is already done, so there is no action to choose.

        """
        if env.done:
            raise ValueError('cannot choose an action: the episode is already over')
        self.root = self.find_root(env)
        # A monotonic clock keeps the search budget unaffected by system
        # clock adjustments.
        limit = time.monotonic() + self.timeout
        # Run at least one rollout, even with a tiny or zero timeout, so the
        # root is guaranteed to have children to choose from.
        while True:
            leaf = self.expand(self.root)
            value = self.simulate(leaf)
            self.backup(leaf, value)
            if time.monotonic() >= limit:
                break
        return max(self.root.children, key=lambda node: node.visits).action

    def expand(self, node):
        """Return an unvisited or terminal leaf node following the tree policy.

        Before returning, this function performs all possible actions from the
        leaf node and adds new nodes for them to the tree as children of the
        leaf node.
        """
        while node.visits != 0 and len(node.children) > 0:
            node = self.tree_policy(node)
        # Never expand a node twice: that would duplicate its children.
        if not node.env.done and not node.children:
            for action in node.env.actions:
                env = node.env.copy()
                _, reward, _, _ = env.step(action)
                node.children.append(TreeNode(node, action, reward, env))
        return node

    def simulate(self, node):
        """Return one total reward from node following uniform random policy."""
        env = node.env.copy()
        total_rewards = np.zeros(env.players)
        while not env.done:
            action = random.choice(env.actions)
            _, rewards, _, _ = env.step(action)
            total_rewards += rewards
        return total_rewards

    def backup(self, node, value):
        """Backup the return from a rollout from node.

        Walking from ``node`` up to the root, each node's own reward is added
        to the running return, and the node's value is updated as the running
        mean of the returns seen so far.
        """
        # Work on a private float copy so the caller's array is left intact.
        value = np.array(value, dtype=float)
        while node is not None:
            value += node.reward
            node.visits += 1
            node.value = (node.visits - 1)/node.visits * node.value + value/node.visits
            node = node.parent

    def find_root(self, env):
        """Return node corresponding to env in current tree using BFS.

        The current root and all of its descendants are searched. A node that
        is found is detached from its parent so that later backups stop there
        and the discarded part of the tree can be garbage-collected. When no
        node matches, a fresh root is created.
        """
        if self.root is not None:
            q = deque([self.root])
            while q:
                node = q.popleft()
                if node.env == env:
                    node.parent = None
                    return node
                q.extend(node.children)
        # Store a snapshot: the caller keeps stepping its own env, which must
        # not change the state recorded in the tree.
        return TreeNode(None, None, np.zeros(env.players), env.copy())

In [44]:
class RandomAgent:
    """Baseline agent that gives every available action the same probability."""

    def act(self, env):
        """Return one entry of ``env.actions`` drawn uniformly at random."""
        available = env.actions
        return random.choice(available)

In [45]:
class HumanAgent:
    """An agent controlled by a human player's input."""

    def act(self, env):
        """Prompt until the player enters one of ``env.actions``.

        The input is read as comma-separated integers, with optional
        parentheses, e.g. ``(1,1)`` or ``1,1``. Text that cannot be parsed,
        or a move that is not currently in ``env.actions``, is rejected with
        a message and the prompt is shown again.

        Returns
        -------
        tuple of int
            The chosen action.

        """
        while True:
            indices = input('Input action: ').replace('(', '').replace(')', '').split(',')
            try:
                action = tuple(int(x) for x in indices)
            except ValueError:
                print('Invalid input: enter a cell as row,col (for example 1,1).')
                continue
            # Membership also rejects wrong-length and negative indices,
            # which would otherwise index the board in unintended ways.
            if action in env.actions:
                return action
            print('That move is not available; choose one of: {}'.format(env.actions))

## Testing

In [46]:
def run_episode(agents, env, render=False):
    """Play one episode from a fresh reset and return the summed rewards.

    On every step the agent at index ``env.turn`` is asked for an action.
    When ``render`` is true, the env is rendered once after the reset and
    again after each step.
    """
    env.reset()
    if render:
        env.render()
    scores = np.zeros(len(agents))
    while True:
        if env.done:
            break
        mover = agents[env.turn]
        _, step_rewards, _, _ = env.step(mover.act(env))
        scores += step_rewards
        if render:
            env.render()
    return scores

In [47]:
# Benchmark setup: a uniformly random player moves first (index 0) against an
# MCTS player (index 1) that searches for 0.1 seconds per move.
env = TicTacToeEnv()
agents = [RandomAgent(), MCTSAgent(timeout=.1)]

In [48]:
# Play 100 episodes; each row of `returns` holds one episode's total reward per player.
%time returns = np.array([run_episode(agents, env) for _ in range(100)])

CPU times: user 33.5 s, sys: 55.4 ms, total: 33.6 s
Wall time: 33.5 s


In [49]:
# Per-player count of episodes whose total reward is exactly 1 (i.e. wins).
(returns == 1).sum(axis=0)

array([ 2, 87])

In [50]:
# Interactive setup: a human moves first (index 0) against an MCTS player
# (index 1) that searches for 1 second per move.
env = TicTacToeEnv()
agents = [HumanAgent(), MCTSAgent(timeout=1.0)]

In [51]:
# Play one game, printing the board after the reset and after every move.
run_episode(agents, env, render=True)

[[0 0 0]
 [0 0 0]
 [0 0 0]]
Input action: (1,1)
[[0 0 0]
 [0 1 0]
 [0 0 0]]
[[-1  0  0]
 [ 0  1  0]
 [ 0  0  0]]
Input action: (2,2)
[[-1  0  0]
 [ 0  1  0]
 [ 0  0  1]]
[[-1  0 -1]
 [ 0  1  0]
 [ 0  0  1]]
Input action: (0,1)
[[-1  1 -1]
 [ 0  1  0]
 [ 0  0  1]]
[[-1  1 -1]
 [ 0  1  0]
 [ 0 -1  1]]
Input action: (1,0)
[[-1  1 -1]
 [ 1  1  0]
 [ 0 -1  1]]
[[-1  1 -1]
 [ 1  1 -1]
 [ 0 -1  1]]
Input action: (2,0)
[[-1  1 -1]
 [ 1  1 -1]
 [ 1 -1  1]]


array([0., 0.])