In [215]:
import gym
import gym_tictactoe
import numpy as np

In [216]:
def checkRows(board):
    """Return the symbol that fills an entire row of ``board``, or 0 if none does.

    Args:
        board: Iterable of rows; each row is an indexable sequence of cell
            values in which 0 marks an empty cell.

    Returns:
        The first cell of the first row whose cells are all equal and
        non-empty, or 0 when no such row exists.
    """
    for row in board:
        # An all-empty row is also "all equal", but it is not a win and must
        # not end the scan before the remaining rows have been examined.
        if len(set(row)) == 1 and row[0] != 0:
            return row[0]
    return 0

def checkDiagonals(board):
    """Return the symbol that fills either diagonal of a square ``board``, or 0.

    Args:
        board: Square, indexable grid (``board[i][j]``) in which 0 marks an
            empty cell.

    Returns:
        The symbol occupying every cell of the main diagonal, otherwise the
        symbol occupying every cell of the anti-diagonal, otherwise 0.
    """
    size = len(board)

    main_diagonal = [board[i][i] for i in range(size)]
    # An all-empty diagonal is uniform but is not a win; fall through so the
    # anti-diagonal is still inspected. (On odd-sized boards both diagonals
    # share the centre cell, so this only matters for even-sized boards.)
    if len(set(main_diagonal)) == 1 and main_diagonal[0] != 0:
        return main_diagonal[0]

    anti_diagonal = [board[i][size - 1 - i] for i in range(size)]
    if len(set(anti_diagonal)) == 1 and anti_diagonal[0] != 0:
        return anti_diagonal[0]

    return 0

def checkWin(board):
    """Return the winning symbol on ``board``, or a falsy value if there is none.

    Rows are examined first, then columns (by running the row check on the
    transposed board), and finally both diagonals.
    """
    by_row = checkRows(board)
    if by_row:
        return by_row

    # Columns of the board are the rows of its transpose.
    by_column = checkRows(np.transpose(board))
    if by_column:
        return by_column

    return checkDiagonals(board)

## Basic adversary for the RL algorithm to compete against

In [252]:
class ImperfectAgent():
    """Scripted opponent that always takes the first free cell in a fixed order.

    The board is a flat sequence of nine cells in which 0 marks an empty cell.
    Cells are tried centre first (4), then the corners (0, 2, 6, 8), then the
    edges (1, 3, 5, 7).
    """

    # Cell indices in the order the agent tries them.
    PREFERENCE_ORDER = (4, 0, 2, 6, 8, 1, 3, 5, 7)

    def policy(self, state):
        """Return a one-hot vector of length 9 selecting the agent's move.

        Args:
            state: Indexable sequence of nine cell values (list or NumPy
                array); 0 marks an empty cell.

        Returns:
            ``numpy.ndarray`` of shape ``(9,)`` with 1.0 at the chosen cell.

        Raises:
            ValueError: If no cell in ``state`` is empty.
        """
        for cell in self.PREFERENCE_ORDER:
            # Compare by value: an identity test against the literal 0 is
            # False for NumPy integers even when the cell is empty.
            if state[cell] == 0:
                return np.eye(9)[cell]

        # Raising a plain string is itself a TypeError in Python 3, so use a
        # real exception type that carries the message.
        raise ValueError("invalid state")

In [253]:
# Create the tic-tac-toe environment by its gym id
# (presumably registered by the `gym_tictactoe` import — TODO confirm).
env = gym.make('TicTacToe-v1')
env.init(symbols=[1, 2]) # Symbols for the two players; the game loop passes 1 or 2 to env.step

In [254]:
# Scripted opponent instance.
# NOTE(review): `adversary` is not referenced again in the visible cells; the
# training cell constructs its own ImperfectAgent.
adversary = ImperfectAgent()

In [255]:
# Reset the environment and keep the state it returns.
# NOTE(review): `original_state` is not used in the visible cells.
original_state = env.reset()

## RL algorithm based agent using temporal-difference (TD) learning

In [256]:
from sympy.utilities.iterables import multiset_permutations

In [272]:
class TDLearningAgent():
    """Greedy tic-tac-toe agent backed by a table of state-value estimates.

    A board state is a flat sequence of nine cells holding 0 (empty), 1 or 2,
    so every state maps to one of 3**9 = 19683 table slots.

    NOTE(review): ``train`` does not update the value table and
    ``learningRate`` is stored but never read in this class, so the
    temporal-difference update itself is still to be implemented.
    """

    # One base-3 digit per board cell; used to convert states <-> table indices.
    _STATE_DIMS = (3, 3, 3, 3, 3, 3, 3, 3, 3)

    def __init__(self, playerNum=2, oppNum=1, lr=0.2):
        """Build the value table and seed the states that contain a winning line.

        Args:
            playerNum: Symbol this agent places on the board.
            oppNum: Symbol the opponent places on the board.
            lr: Learning rate, stored as ``learningRate``.
        """
        # Every state starts at 0.5.
        self.valueFunctionTable = np.full((19683,), 0.5)
        # One (value of current state, value of chosen successor) pair per move.
        self.valuePredictionHistories = []
        self.player = playerNum
        self.opponent = oppNum
        self.learningRate = lr

        for index in range(len(self.valueFunctionTable)):
            winner = checkWin(np.array(self.num_to_state(index)).reshape((3, 3)))
            # checkWin hands back a NumPy integer taken from the board array,
            # so compare by value; an identity test ("is") against a Python
            # int never matches and would leave every state at 0.5.
            if winner == self.player:
                self.valueFunctionTable[index] = 1
            elif winner == self.opponent:
                self.valueFunctionTable[index] = 0

    def state_to_num(self, state):
        """Return the table index of a nine-cell ``state``."""
        assert len(state) == 9
        # tuple() gives ravel_multi_index one coordinate per dimension
        # regardless of whether ``state`` is a list or an array.
        return np.ravel_multi_index(tuple(state), self._STATE_DIMS)

    def num_to_state(self, num):
        """Return the nine cell values encoded by table index ``num`` as a tuple."""
        return np.unravel_index(num, self._STATE_DIMS)

    def value_from_state(self, state):
        """Return the current value estimate stored for ``state``."""
        return self.valueFunctionTable[self.state_to_num(state)]

    def train(self):
        """Discard the most recent entry of ``valuePredictionHistories``.

        Does nothing when the history is empty. No table entry is modified.
        """
        if self.valuePredictionHistories:
            self.valuePredictionHistories.pop()

    def policy(self, state):
        """Return a one-hot vector of length 9 selecting the greedy move.

        Each empty cell is scored by the table value of the state obtained by
        placing this agent's symbol there; occupied cells score -1. The first
        highest-scoring cell is chosen, and the pair (value of ``state``,
        value of the chosen successor) is appended to
        ``valuePredictionHistories``. ``state`` itself is not modified.

        Args:
            state: Nine cell values (list or NumPy array); 0 marks an empty
                cell. Must provide ``copy()``.

        Returns:
            ``numpy.ndarray`` of shape ``(9,)`` with 1.0 at the chosen cell.
        """
        current_value_prediction = self.value_from_state(state)

        value_predictions = []
        for i, element in enumerate(state):
            # Compare by value so NumPy-backed states are handled too.
            if element == 0:
                forward_state = state.copy()
                forward_state[i] = self.player
                value_predictions.append(self.value_from_state(forward_state))
            else:
                # Sentinel below every table value so occupied cells lose.
                value_predictions.append(-1)

        greedy_action = np.argmax(value_predictions)

        self.valuePredictionHistories.append(
            (current_value_prediction, value_predictions[greedy_action])
        )

        return np.eye(9)[greedy_action]

In [273]:
Imperfect = ImperfectAgent()
TDLearner = TDLearningAgent()

# Play 1000 games. Symbol 1 (the ImperfectAgent) always moves first and the
# two symbols alternate until the environment reports the game is done.
for gameNum in range(1000):
    state = env.reset()
    done = False
    curr_action = 1
    while not done:
        # Both policies return a length-9 one-hot vector over the cells.
        mover = Imperfect if curr_action == 1 else TDLearner
        decision = mover.policy(state)

        # Draw the cell to play from that distribution.
        actionIndex = np.random.choice(9, 1, p=decision)[0]

        _state, reward, done, info = env.step(actionIndex, curr_action)

        if done:
            # Print the symbol checkWin reports for the final board.
            winner = checkWin(np.array(_state).reshape((3,3)))
            print(winner)

        state = _state

        # Hand the turn to the other symbol.
        curr_action = 2 if curr_action == 1 else 1

    TDLearner.train()

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
