In [6]:
import random
from collections import namedtuple
from itertools import combinations
from random import seed

Position = namedtuple("Position", ["x", "o"])

seed(40)

In [7]:
class TicTacToe:
    def __init__(self, playerX, playerO, human_game=False):
        self.board = [2, 7, 6, 9, 5, 1, 4, 3, 8]
        self.current_board = Position(set(), set())
        self.playerX, self.playerO = playerX, playerO
        self.playerX_turn = random.choice([True, False])
        self.winner = None
        self.human_game = human_game

    def play_game(self):
        self.playerX.start_game("X")
        self.playerO.start_game("O")
        while True:
            player, char, other_player = (
                (self.playerX, "X", self.playerO)
                if self.playerX_turn
                else (self.playerO, "O", self.playerX)
            )
            
            if self.human_game:
                print(f"Player {char} move")
                self.print_board_info()
                
            move = player.move(self.current_board)
            
            moves = self.current_board.x if self.playerX_turn else self.current_board.o
            moves.add(move)
            if self.human_game:
                self.print_board()

            if self.win(moves):
                player.reward(1, self.current_board)
                other_player.reward(-1, self.current_board)
                self.winner = char
                break

            if self.board_full():  # tie game
                player.reward(0.5, self.current_board)
                other_player.reward(0.5, self.current_board)
                self.winner = None
                break

            other_player.reward(0, self.current_board)
            self.playerX_turn = not self.playerX_turn

    def win(self, state):
        return any(sum(c) == 15 for c in combinations(state, 3))

    def board_full(self):
        player = self.playerX if self.playerX_turn else self.playerO
        return player.available_moves(self.current_board) == set()

    def print_board(self):
        for r in range(3):
            print("-------------")
            for c in range(3):
                i = r * 3 + c
                char = " "
                if self.board[i] in self.current_board.x:
                    char = "X"
                elif self.board[i] in self.current_board.o:
                    char = "O"
                print(f"| {char}", end=" ")
            print("|")
        print("-------------")

    def print_board_info(self):
        for r in range(3):
            print("-------------")
            for c in range(3):
                i = r * 3 + c

                print(f"| {self.board[i]}", end=" ")
            print("|")
        print("-------------")

In [8]:
class Player(object):
    def __init__(self):
        self.name = "human"

    def start_game(self, char):
        print("\nNew game!")

    def move(self, current_board):
       
        move = int(input("Your move? "))
        if move not in self.available_moves(current_board):
            print("Illegal move.")
            move = self.move(current_board)
        return move

    def reward(self, value, current_board):
        print("{} rewarded: {}".format(self.name, value))

    def available_moves(self, current_board):
        available = set(range(1, 9 + 1)) - current_board.x - current_board.o

        return available


class RandomPlayer(Player):
    def __init__(self):
        self.name = "random"

    def reward(self, value, board):
        pass

    def start_game(self, char):
        pass

    def move(self, current_board):
        available = self.available_moves(current_board)

        return random.choice(list(available))


class QLearningPlayer(Player):
    def __init__(self, epsilon=0.2, alpha=0.3, gamma=0.9):
        self.name = "Qlearner"
        self.harm_humans = False
        self.q = {}  # (state, action) keys: Q values
        self.epsilon = epsilon  # e-greedy chance of random exploration
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor for future rewards

    def start_game(self, char):
        self.last_board = (set(), set())
        self.last_move = None

    def getQ(self, state, action):
        # encourage exploration; "optimistic" 1.0 initial values
        if self.q.get((state, action)) is None:
            self.q[(state, action)] = 1.0
        return self.q.get((state, action))

    def move(self, current_board):
        self.last_board = (
            tuple(current_board.x),
            tuple(current_board.o),
        )  # Convert Position to tuple
        actions = list(self.available_moves(self.last_board))

        if random.random() < self.epsilon:  # explore!
            self.last_move = random.choice(list(actions))
            return self.last_move

        qs = [self.getQ(self.last_board, a) for a in actions]
        maxQ = max(qs)

        if qs.count(maxQ) > 1:
            # more than 1 best option; choose among them randomly
            best_options = [i for i in range(len(actions)) if qs[i] == maxQ]
            i = random.choice(best_options)
        else:
            i = qs.index(maxQ)

        self.last_move = actions[i]
        return actions[i]

    def reward(self, value, current_board):
        if self.last_move:
            self.learn(
                self.last_board,
                self.last_move,
                value,
                (tuple(current_board[0]), tuple(current_board[1])),
            )

    def learn(self, state, action, reward, result_state):
        prev = self.getQ(state, action)
        maxqnew = max([self.getQ(result_state, a) for a in self.available_moves(state)])
        self.q[(state, action)] = prev + self.alpha * (
            (reward + self.gamma * maxqnew) - prev
        )

    def available_moves(self, current_board):
        available = set(range(1, 9 + 1)) - set(current_board[0]) - set(current_board[1])
        return available

In [9]:
def trained_agent(p1, agent, num_games=200000):
    for i in range(0, num_games):
        t = TicTacToe(p1, agent)
        t.play_game()
    return agent


agent = trained_agent(RandomPlayer(), QLearningPlayer())
p1 = RandomPlayer()
agent.epsilon = 0

num_X = 0
num_O = 0
num_ties = 0
for _ in range(100):
    t = TicTacToe(p1, agent)
    t.play_game()
    
    if t.winner is "X":
        num_X += 1
    elif t.winner is "O":
        num_O += 1
    else:
        num_ties += 1

print("X wins: " + str(num_X))
print("O wins: " + str(num_O))
print("Ties: " + str(num_ties))






X wins: 2
O wins: 94
Ties: 4


  if t.winner is "X":
  elif t.winner is "O":


In [10]:
human = Player()
t = TicTacToe(human, agent, human_game=True)
t.print_board_info()
t.print_board()

t.play_game()
print("Winner is: " + str(t.winner))
t.print_board()

-------------
| 2 | 7 | 6 |
-------------
| 9 | 5 | 1 |
-------------
| 4 | 3 | 8 |
-------------
-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------

New game!
Player O move
-------------
| 2 | 7 | 6 |
-------------
| 9 | 5 | 1 |
-------------
| 4 | 3 | 8 |
-------------
-------------
| O |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
human rewarded: 0
Player X move
-------------
| 2 | 7 | 6 |
-------------
| 9 | 5 | 1 |
-------------
| 4 | 3 | 8 |
-------------
-------------
| O |   |   |
-------------
|   |   |   |
-------------
|   |   | X |
-------------
Player O move
-------------
| 2 | 7 | 6 |
-------------
| 9 | 5 | 1 |
-------------
| 4 | 3 | 8 |
-------------
-------------
| O |   |   |
-------------
|   |   | O |
-------------
|   |   | X |
-------------
human rewarded: 0
Player X move
-------------
| 2 | 7 | 6 |
-------------
| 9 | 5 | 1 |
-------------
| 4 | 3 | 8 |
-------------
-------------
|