In [11]:
from itertools import combinations
from collections import namedtuple
import numpy as np

Position = namedtuple('Position', ['x', 'o'])


## Enivironment Setup

In [12]:
class TicTacToe:
    def __init__(self):
        self.board = [2, 7, 6, 9, 5, 1, 4, 3, 8]
        self.players = ["X", "O"]
        self.current_state = Position(set(), set())
        self.current_player = None
        self.winner = None
        self.game_over = False

    def reset(self):
        self.board = [2, 7, 6, 9, 5, 1, 4, 3, 8]
        self.current_player = None
        self.current_state = Position(set(), set())
        self.winner = None
        self.game_over = False

    def available_moves(self):
        available = set(range(1, 9 + 1)) - self.current_state.x - self.current_state.o

        return available

    def make_move(self, move):
        if self.current_player == self.players[0]:
            self.current_state.x.add(move)
            self.game_over = self.win(self.current_state.x)
            if self.game_over:
               self.winner = self.players[0]
        else:
            self.current_state.o.add(move)
            self.game_over = self.win(self.current_state.o)
            self.winner = self.players[1]
            if self.game_over:
                self.winner = self.players[1]

        
        if len(self.available_moves()) == 0:
            self.game_over = True
            self.winner = "Tie"
            
        if not self.game_over:
            self.switch_player()

        return self.current_state

    def switch_player(self):
        if self.current_player == self.players[0]:
            self.current_player = self.players[1]
        else:
            self.current_player = self.players[0]

    def win(self, state):
        return any(sum(c) == 15 for c in combinations(state, 3))

    def state_value(self):
   
        if self.current_player == self.players[0] and self.win(self.current_state.x):
            return 1
        elif self.current_player == self.players[1] and self.win(self.current_state.o):
            return -1
        else:
            return 0.1

    def print_board(self):
        for r in range(3):
            print("-------------")
            for c in range(3):
                print("|", end="" " ")
                i = r * 3 + c
                if self.board[i] in self.current_state.x:
                    print("X", end="" " ")

                elif self.board[i] in self.current_state.o:
                    print("O", end="" " ")

                else:
                    print(" ", end="" " ")

            print("|", end="" " ")
            print()

        print("-------------")

In [13]:
import random

class QLearningAgent:
    def __init__(self, alpha, epsilon, discount_factor):
        self.Q = {}
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.counter = 0

    def get_Q_value(self, state, action):
        key = (frozenset(state[0]), frozenset(state[1]), action)
        if key not in self.Q:
            self.Q[key] = 0.0
        return self.Q[key]

    def choose_action(self, state, available_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(list(available_moves))
        else:
            
            Q_values = [self.get_Q_value(state, action) for action in available_moves]
            max_Q = max(Q_values)
            if Q_values.count(max_Q) > 1:
                best_moves = [i for i in range(len(available_moves)) if Q_values[i] == max_Q]
                i = random.choice(best_moves)
            else:
                i = Q_values.index(max_Q)
            return list(available_moves)[i]

    def update_Q_value(self, state, action, reward, next_state, game):
        key = (frozenset(state[0]), frozenset(state[1]), action)
     
        next_Q_values = [self.get_Q_value(next_state, next_action) for next_action in game.available_moves()]
      
        max_next_Q = max(next_Q_values) if next_Q_values else 0.0
      
        if key not in self.Q:
            self.Q[key] = 0.0
            self.counter += 1
           
        
        self.Q[key] += self.alpha * (reward + self.discount_factor * max_next_Q - self.Q[key])

In [14]:
def train(num_episodes, alpha, epsilon, discount_factor):
    agent = QLearningAgent(alpha, epsilon, discount_factor)
    counter = 0
    for _ in range(num_episodes):
        game = TicTacToe()
        initial_state = Position(set(), set())
        game.current_state = initial_state
        
        
        while game.game_over == False:
            available_moves = game.available_moves()

            action = agent.choose_action(game.current_state, available_moves)

            next_state = game.make_move(action)
            counter += 1
            reward = game.state_value()
            # if reward != 0:
            #     print("reward", reward)
            agent.update_Q_value(game.current_state, action, reward, next_state, game)
    
    print("Number of moves", counter*num_episodes)
    print("Number of fail", agent.counter)
    return agent

In [15]:
def test(agent, num_games):
    num_wins_X = 0
    num_wins_O = 0

    for _ in range(num_games):
        new_game = TicTacToe()
        initial_state = Position(set(), set())
        new_game.current_state = initial_state
        while not new_game.game_over:
          
            if new_game.current_player == "X":
                action = agent.choose_action(
                    new_game.current_state, new_game.available_moves()
                )
            else:
                action = random.choice(list(new_game.available_moves()))

            next_state = new_game.make_move(action)
            new_game.current_state = next_state

        if new_game.winner == "X":
            num_wins_X += 1
        elif new_game.winner == "O":
            num_wins_O += 1

    print("X wins:", num_wins_X / num_games * 100)
    print("O wins: ", num_wins_O / num_games * 100)
    print("Tie: ", (num_games - num_wins_X - num_wins_O) / num_games * 100)


agent = train(num_episodes=1000, alpha=0.5, epsilon=0.1, discount_factor=1.0)
print(agent.Q)
test(agent, 1000)

Number of moves 7601000
Number of fail 4266
{(frozenset(), frozenset(), 1): 0.0, (frozenset(), frozenset(), 2): 0.0, (frozenset(), frozenset(), 3): 0.0, (frozenset(), frozenset(), 4): 0.0, (frozenset(), frozenset(), 5): 0.0, (frozenset(), frozenset(), 6): 0.0, (frozenset(), frozenset(), 7): 0.0, (frozenset(), frozenset(), 8): 0.0, (frozenset(), frozenset(), 9): 0.0, (frozenset(), frozenset({9}), 1): 0.0, (frozenset(), frozenset({9}), 2): 0.0, (frozenset(), frozenset({9}), 3): 0.0, (frozenset(), frozenset({9}), 4): 0.0, (frozenset(), frozenset({9}), 5): 0.0, (frozenset(), frozenset({9}), 6): 0.0, (frozenset(), frozenset({9}), 7): 0.0, (frozenset(), frozenset({9}), 8): 0.0, (frozenset(), frozenset({9}), 9): 0.1, (frozenset({3}), frozenset({9}), 1): 0.0, (frozenset({3}), frozenset({9}), 2): 0.0, (frozenset({3}), frozenset({9}), 4): 0.0, (frozenset({3}), frozenset({9}), 5): 0.0, (frozenset({3}), frozenset({9}), 6): 0.0, (frozenset({3}), frozenset({9}), 7): 0.0, (frozenset({3}), frozenset({