In [1]:
import numpy as np

## Environment

In [2]:
class TicTacToeEnv:
    """An environment for one-player tic-tac-toe.

    The board is a 3x3 integer array in which 0 marks a free cell and 1 a
    cell the player has taken. Every step yields a reward of -1, and the
    episode is done once a full row, column or diagonal has been taken.

    Parameters
    ----------
    state : array_like, optional
        Initial 3x3 board. It is copied, so later moves never modify the
        caller's array. Defaults to an empty board.
    """

    def __init__(self, state=None):
        # A default-argument array would be created once and shared (and
        # mutated) by every default-constructed environment, so the board is
        # built per instance instead. The builtin ``int`` replaces the
        # ``np.int`` alias, which no longer exists in current NumPy releases.
        if state is None:
            self.state = np.zeros((3, 3), dtype=int)
        else:
            self.state = np.array(state, dtype=int)

    def reset(self):
        """Clear the board and return a copy of the empty state."""
        self.state = np.zeros((3, 3), dtype=int)
        return self.state.copy()

    def step(self, action):
        """Take the cell at ``action`` (a ``(row, col)`` pair).

        Returns
        -------
        tuple
            ``(state_copy, -1, done, {})``.
        """
        # A list such as [0, 1] would select whole rows via fancy indexing;
        # a tuple always addresses exactly one cell.
        action = tuple(action)
        assert self.state[action] != 1, "cell %r is already taken" % (action,)
        self.state[action] = 1
        return self.state.copy(), -1, self.done, {}

    def copy(self):
        """Return an independent environment holding a copy of the board."""
        return TicTacToeEnv(self.state)

    def render(self):
        """Print the board."""
        print(self.state)

    @property
    def actions(self):
        """The available actions for the current state."""
        if self.done:
            return []
        return [(a, b) for a in range(3) for b in range(3) if self.state[a, b] != 1]

    @property
    def done(self):
        """True if three in a row somewhere."""
        board = self.state
        idx = np.arange(3)
        line_sums = [
            *board.sum(axis=1),          # the three rows
            *board.sum(axis=0),          # the three columns
            board[idx, idx].sum(),       # main diagonal
            board[idx, idx[::-1]].sum()  # anti-diagonal
        ]
        return any(total == 3 for total in line_sums)

## Agent

In [3]:
def epsilongreedy(node, epsilon):
    """Select a child of ``node`` with the epsilon-greedy rule.

    With probability ``epsilon`` a child is drawn uniformly at random;
    otherwise the child with the largest ``value`` is returned.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.choice(node.children)
    # Exploit: the first child with the maximal value estimate wins ties.
    return max(node.children, key=lambda child: child.value)

In [4]:
def ucb(node, c):
    """Select a child of ``node`` by Upper Confidence Bound (Kocsis-Szepesvari).

    Each child is scored as ``value + c * sqrt(log(parent visits) / visits)``;
    a child that has never been visited scores infinity. The highest-scoring
    child is returned.
    """
    log_parent_visits = np.log(node.visits)

    def score(child):
        # Unvisited children take priority over every visited one.
        if child.visits == 0:
            return np.inf
        return child.value + c*np.sqrt(log_parent_visits/child.visits)

    return max(node.children, key=score)

In [5]:
class TreeNode:
    """A single node of the tree grown by Monte Carlo tree search.

    The constructor stores its arguments unchanged and starts the search
    statistics (``visits``, ``value``, ``children``) out empty.
    """

    def __init__(self, parent, action, reward, done, env):
        # Link to the parent node and the action stored with this node.
        self.parent, self.action = parent, action
        # Reward and done flag stored with this node.
        self.reward, self.done = reward, done
        # Environment held by this node.
        self.env = env
        # Search statistics: nothing has been visited or expanded yet.
        self.visits, self.value = 0, 0
        self.children = []

class MCTSAgent:
    """A Monte Carlo tree search agent.

    Parameters
    ----------
    env_fn : function
        A function which maps states to new environments.
    epsilon : float, optional
        The fraction of uniform random choices in epsilon-greedy. It is only
        stored: the UCB tree policy used by ``expand`` does not read it.
    rollouts : int, optional
        The number of rollouts to perform before choosing an action.
    c : float, optional
        The exploration constant handed to ``ucb`` by the tree policy.

    """

    def __init__(self, env_fn, epsilon=0.05, rollouts=100, c=1):
        self.env_fn = env_fn
        self.epsilon = epsilon
        self.rollouts = rollouts
        self.c = c

    def act(self, state):
        """Run ``rollouts`` search iterations from ``state`` and pick an action.

        Returns the action of the root child with the highest value estimate.

        Raises
        ------
        ValueError
            If the environment built from ``state`` offers no actions, since
            there is then nothing to search or to return.
        """
        env = self.env_fn(state)
        # Fail early with a clear message; otherwise the first rollout would
        # try to sample from an empty action list.
        if len(env.actions) == 0:
            raise ValueError("cannot choose an action: the state has no available actions")
        root = TreeNode(None, None, 0, False, env)
        for _ in range(self.rollouts):
            leaf = self.expand(root)
            value = self.simulate(leaf)
            self.backup(leaf, value)
        # NOTE(review): a child's ``value`` is the mean return from the child
        # onward and excludes the child's own step reward (see ``backup``).
        # Ranking by it matches ranking by action value only when all step
        # rewards are equal -- confirm before using varying rewards.
        return max(root.children, key=lambda child: child.value).action

    def expand(self, node):
        """Return an unvisited or terminal leaf node following the UCB tree policy.

        Before returning, this function performs all possible actions from the
        leaf node and adds new nodes for them to the tree as children of the
        leaf node.
        """
        while node.visits != 0 and len(node.children) > 0:
            # Tree policy: UCB with exploration constant ``self.c``.
            # ``epsilongreedy(node, self.epsilon)`` is the alternative.
            node = ucb(node, self.c)
        for action in node.env.actions:
            env = node.env.copy()
            _, reward, done, _ = env.step(action)
            node.children.append(TreeNode(node, action, reward, done, env))
        return node

    def simulate(self, node):
        """Return one total reward from node following uniform random policy.

        The rollout runs on a copy of ``node.env``, so the node's own
        environment is left untouched.
        """
        env = node.env.copy()
        done = node.done
        total_reward = 0
        while not done:
            # Read once per step: ``actions`` may be a computed property.
            actions = env.actions
            if len(actions) == 0:
                # Dead end: nothing left to play although the env is not done.
                break
            action = actions[np.random.choice(len(actions))]
            _, reward, done, _ = env.step(action)
            total_reward += reward
        return total_reward

    def backup(self, node, value):
        """Backup the return from a rollout from node.

        Walks from ``node`` up to the root, folding ``value`` into each node's
        running mean and adding the node's own reward before moving to its
        parent.
        """
        while node is not None:
            node.visits += 1
            # Running mean over all returns backed up through this node.
            node.value = (node.visits - 1)/node.visits * node.value + value/node.visits
            value += node.reward
            node = node.parent

## Testing

In [6]:
def run_episode(agent, env, render=False):
    """Play a single episode of ``env`` with ``agent``.

    The environment is reset first; the agent then acts on each returned
    state until the environment reports done. When ``render`` is true the
    environment is rendered after every step. Returns the sum of all step
    rewards.
    """
    observation = env.reset()
    episode_return = 0
    finished = False
    while not finished:
        chosen = agent.act(observation)
        observation, step_reward, finished, _ = env.step(chosen)
        episode_return += step_reward
        if render:
            env.render()
    return episode_return

In [7]:
# Play one episode with a 1000-rollout agent, printing the board after every
# move; the cell's value is the episode's total reward.
# NOTE(review): no random seed is set, so the moves shown can differ per run.
agent = MCTSAgent(TicTacToeEnv, rollouts=1000)
env = TicTacToeEnv()
run_episode(agent, env, render=True)

[[0 0 0]
 [0 1 0]
 [0 0 0]]
[[0 0 0]
 [0 1 0]
 [0 1 0]]
[[0 1 0]
 [0 1 0]
 [0 1 0]]


-3

In [8]:
# Average the total reward of 100 unrendered episodes played by a 100-rollout
# agent. run_episode resets env at the start of every episode, so a single
# environment instance is reused throughout.
agent = MCTSAgent(TicTacToeEnv, rollouts=100)
env = TicTacToeEnv()
np.mean([run_episode(agent, env) for _ in range(100)])

-3.0