In [497]:
from collections import namedtuple
from itertools import combinations
import random
import numpy

from copy import deepcopy
from collections import defaultdict
from tqdm.auto import tqdm

MAGIC_SQUARE = [2, 7, 6, 
                9, 5, 1, 
                4, 3, 8]

In [498]:
State = namedtuple('State', ['x', 'o'])

In [508]:
def win(elements):
    return any(sum(c) == 15 for c in combinations(elements, 3))


def winning_state(pos: State):
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else: 
        return 0


def available_moves(game_state: State):
    return [i for i in range(1, 10) if i not in game_state.x and i not in game_state.o]


def print_board(state: State):
    for row in range(3):
        for col in range(3):
            index = row * 3 + col
            if MAGIC_SQUARE[index] in state.x:
                print('x ', end='')
            elif MAGIC_SQUARE[index] in state.o:
                print('o ', end='')
            else:
                print('- ', end='')
        print()

## Q_learning_player
The Reinforced learning player is characterized by 3 factors that help it make choices and adapt
- *learning rate* determines the impact of newly acquired informations compared to the old ones: at 0 the system doesn't learn anything, at 1 the system only considers the last acquired information
- *discount factor* determines the impact of future rewards: at 0 the system only considers immediate reward (greedy), as the value increases the system considers more and more future rewards
- *exploration_rate* determines the chance the system will explore the environment rather than exploiting it (acts as a percentage chance)

The Q_learning_player decides every move wether to explore (make a random move) or exploit (make a move it knows the reward for). For exploiting it keeps a dictionary of all encountered game states, and to each state corresponds a list possible moves from that state and a list containing the associated rewards

In [501]:
class random_player():
    def make_move(self, game_state: State):
        return random.choice(available_moves(game_state))
    
    def name(self):
        return 'Random_player'


class Q_learning_player():
    def __init__(self, learning_rate, discount_factor, exploration_rate, role):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.learning_dictionary = defaultdict(int)
        self.role = role
        # because the rewards are set up to be positive if x wins and negative if o wins, 
        # the player must know if it's playing as x or o to know if it must maximize on minimize the rewards

    def make_move(self, game_state: State):
        hashable_state = (frozenset(game_state.x), frozenset(game_state.o))
        if random.uniform(0, 1) > self.exploration_rate and self.learning_dictionary[hashable_state] != 0:
            # exploit, aka choose a move based on what it has learned so far
            moves, rewards = self.learning_dictionary[hashable_state] # get a list of possible moves from the current state
            if self.role == 'x':
                best_move_index = rewards.index(max(rewards))
            else:
                best_move_index = rewards.index(min(rewards))
            return moves[best_move_index]
        else:
            # explore, aka make a new random move outside of what it has learned so far
            return random.choice(available_moves(game_state))

    def update(self, game_log, reward):
        for game_state, move, next_state in game_log: # for each move it took during the game

            hashable_state = (frozenset(game_state.x), frozenset(game_state.o))
            hashable_next_state = (frozenset(next_state.x), frozenset(next_state.o))
            existing_state = self.learning_dictionary[hashable_state]
            if existing_state == 0:
                self.learning_dictionary[hashable_state] = [[], []]

            previous_moves: list = self.learning_dictionary[hashable_state][0]
            if move in previous_moves:
                i = previous_moves.index(move)
                current_reward = self.learning_dictionary[hashable_state][1][i]
                if self.learning_dictionary[hashable_next_state] != 0:
                    optimal_next_reward = max(self.learning_dictionary[hashable_next_state][1])
                else:
                    optimal_next_reward = 0
                # self.learning_dictionary[hashable_state][1][i] = previous_reward * self.learning_rate + (reward - previous_reward) * self.discount_factor
                self.learning_dictionary[hashable_state][1][i] = current_reward * (1 - self.learning_rate) + self.learning_rate * (reward + self.discount_factor * optimal_next_reward)
            else:
                move_reward = reward * self.discount_factor
                self.learning_dictionary[hashable_state][0].append(move)
                self.learning_dictionary[hashable_state][1].append(move_reward)
        
    def name(self):
        return 'Q_learning_player'
    
    def data(self):
        return self.role, self.learning_rate, self.discount_factor, self.exploration_rate

For training we play the game and every time the learning system makes a move it saves the state and the move it took in the game_log for the learning process
After every game, if the player won or lost it gains a positive or negative reward associated with the moves it took at a given state (in case of a draw it learns nothing)

In [502]:
def training(players, epochs, learning_player):
    if learning_player == 'x':
        learning_player = 0
    if learning_player == 'o':
        learning_player = 1

    for _ in range(epochs):
        game_over = False
        game_log = []
        game_state = State(set(), set())
        player_turn = 0
        while not game_over:

            move = players[player_turn].make_move(game_state)
            if learning_player == player_turn:
                current_state = deepcopy(game_state)
            if player_turn == 0:
                game_state.x.add(move)
            else:
                game_state.o.add(move)
            if learning_player == player_turn:
                game_log.append((current_state, move, deepcopy(game_state)))

            if winning_state(game_state) or len(available_moves(game_state)) == 0:
                game_over = True
            
            player_turn = 1 - player_turn

        if winning_state(game_state):
            players[learning_player].update(game_log, winning_state(game_state))

This is a function to run a normal (non training) game, used to test the trained model

In [503]:
def game(players, show=False):
    game_state = State(set(), set())

    move_counter = 0
    while True:
        move = players[0].make_move(game_state)
        game_state.x.add(move)
        move_counter += 1
        if show:
            print(f'Player X, move {move_counter}')
            print_board(game_state)

        if winning_state(game_state):
            if show:
                print(f'player X won')
            return winning_state(game_state)

        if len(available_moves(game_state)) == 0:
            if show:
                print('Draw')
            return winning_state(game_state)

        move = players[0].make_move(game_state)
        game_state.o.add(move)
        move_counter += 1
        if show:
            print(f'Player O, move {move_counter}')
            print_board(game_state)

        if winning_state(game_state):
            if show:
                print(f'player O won')
            return winning_state(game_state)
    
    
def testing(players, n_tests, learning_player):
    if learning_player == 'x':
        learning_player = 0
    if learning_player == 'o':
        learning_player = 1

    wins = [0, 0, 0]
    for _ in range(n_tests):
        winner = game(players)
        if winner == 1:
            wins[0] += 1
        if winner == -1:
            wins[1] += 1
        if winner == 0:
            wins[2] += 1
    print(f'Final results out of {n_tests} games: \n\t{players[0].name()} win rate: {round(wins[0]/n_tests*100, 2)} %\n\t{players[1].name()} win rate {round(wins[1]/n_tests*100, 2)} %\n\tDraw rate {round(wins[2]/n_tests*100, 2)} %')
    return round(wins[learning_player]/n_tests*100, 2), players[learning_player].data()

Here I run some trials with different values for learning rate, discount rate and exploration rate to find the best combination

In [504]:

results_x = []
results_o = []
#lr, df, er = 0.1, 0.9, 0.1
for lr in [0.1, 0.5, 0.9]:
    for df in [0.1, 0.5, 0.9]:
        for er in [0.1, 0.5, 0.9]:
            for role in ['x', 'o']:
                print(f'learning_rate, discount_factor, exploration_rate = {lr}, {df}, {er}\nplaying as = {role}')
                player_1 = Q_learning_player(lr, df, er, role)
                player_2 = random_player()
                train_epochs = 100_000
                test_epochs = 1_000
                if role == 'x':
                    players = [player_1, player_2]
                elif role == 'o':
                    players = [player_2, player_1]

                training(players, train_epochs, role)

                win_rate, data = testing(players, test_epochs, role)
                if data[0] == 'x':
                    results_x.append([win_rate, data])
                if data[0] == 'o':
                    results_o.append([win_rate, data])
                print('\n')

print('----------------------------------------')
best = max(sorted(results_x, key=lambda x: x[0]))
print(f'best results when playing as X (1st player): {best[0]}')
print(f'obtained with learning_rate, discount_factor, exploration_rate = {best[1][1]}, {best[1][2]}, {best[1][3]}')

best = max(sorted(results_o, key=lambda x: x[0]))
print(f'best results when playing as O (2nd player): {best[0]}')
print(f'obtained with learning_rate, discount_factor, exploration_rate = {best[1][1]}, {best[1][2]}, {best[1][3]}')
            

learning_rate, discount_factor, exploration_rate = 0.1, 0.1, 0.1
playing as = x
Final results out of 1000 games: 
	Q_learning_player win rate: 89.2 %
	Random_player win rate 3.6 %
	Draw rate 7.2 %


learning_rate, discount_factor, exploration_rate = 0.1, 0.1, 0.1
playing as = o
Final results out of 1000 games: 
	Random_player win rate: 59.0 %
	Q_learning_player win rate 27.9 %
	Draw rate 13.1 %


learning_rate, discount_factor, exploration_rate = 0.1, 0.1, 0.5
playing as = x
Final results out of 1000 games: 
	Q_learning_player win rate: 80.4 %
	Random_player win rate 10.8 %
	Draw rate 8.8 %


learning_rate, discount_factor, exploration_rate = 0.1, 0.1, 0.5
playing as = o
Final results out of 1000 games: 
	Random_player win rate: 57.8 %
	Q_learning_player win rate 28.5 %
	Draw rate 13.7 %


learning_rate, discount_factor, exploration_rate = 0.1, 0.1, 0.9
playing as = x
Final results out of 1000 games: 
	Q_learning_player win rate: 62.8 %
	Random_player win rate 25.8 %
	Draw rate 11.4 %


Running some more in depth tests (more training epochs) for the best results we got above, just to make sure it wasn't a fluke

In [507]:
lr, df, er, role = 0.9, 0.9, 0.1, 'x'
print(f'learning_rate, discount_factor, exploration_rate = {lr}, {df}, {er}\nplaying as = {role}')
player_1 = Q_learning_player(lr, df, er, role)
player_2 = random_player()
train_epochs = 1_000_000
test_epochs = 100_000
if role == 'x':
    players = [player_1, player_2]
elif role == 'o':
    players = [player_2, player_1]

training(players, train_epochs, role)
testing(players, test_epochs, role)
print('\n')

learning_rate, discount_factor, exploration_rate = 0.9, 0.9, 0.1
playing as = x
Final results out of 100000 games: 
	Q_learning_player win rate: 87.6 %
	Random_player win rate 2.56 %
	Draw rate 9.84 %




## Conclusions
There clearly is some issue when going second (playing as o) but as of writing this I'm not sure what it is.
Because of the deadline I'll leave these results as they are but I'll continue on my own time to work on them to try and fix this issue

(my thoughts is there is some error in how I swap the objective of the learning player when I make it play as 'o' by having it minimize the reward instead of maximizing them)