In [1]:
from Jogo import JogoDaVelha, play_game

In [2]:
import random
import numpy as np

class QLearningAgent:
    def __init__(self, epsilon=0.1, gamma=0.9, learning_rate=0.1):
        self.q_table = {}
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate

    def get_state_key(self, state):
        return tuple(map(tuple, state))

    def choose_action(self, state):
        state_key = self.get_state_key(state)
        if state_key not in self.q_table:
            self.q_table[state_key] = [0] * 9

        valid_moves = [i for i in range(9) if state[i // 3][i % 3] == 0]
        if not valid_moves:
            return random.choice(range(9))
            
        if random.random() < self.epsilon:
            return random.choice(valid_moves)
        else:
            q_values = [self.q_table[state_key][i] for i in valid_moves]
            max_q = max(q_values)
            best_options = [i for i, q in zip(valid_moves, q_values) if q ==max_q]
            return random.choice(best_options)

    def learn(self, state, action, reward, next_state):
        state_key = self.get_state_key(state)
        next_state_key = self.get_state_key(next_state)

        if state_key not in self.q_table:
            self.q_table[state_key] = [0] * 9
            
        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = [0] * 9

        max_next_q = max(self.q_table[next_state_key])
        current_q = self.q_table[state_key][action]
        self.q_table[state_key][action] = current_q + self.learning_rate*(reward + self.gamma*max_next_q - current_q)

In [3]:
agent = QLearningAgent()

In [4]:
import numpy as np
import random
from tqdm import tqdm

num_episodes = 1000

for episode in tqdm(range(num_episodes), desc="Treinando"):
    game = JogoDaVelha()
    while not game.game_over:

        state = game.board
        action = agent.choose_action(state)
        game.make_move(action // 3, action % 3)

        next_state = game.board

        if game.winner == 1:
            reward = 1
        elif game.winner == 0:
            reward =0
        else:
            reward = -1

        agent.learn(state, action, reward, next_state)

Treinando: 100%|██████████| 1000/1000 [00:00<00:00, 2493.34it/s]


In [17]:
import numpy as np
import random
from tqdm import tqdm

num_episodes = 1000000

for episode in tqdm(range(num_episodes), desc="Treinando"):
    game = JogoDaVelha()

    if episode % 2 == 0:
        game.current_player = 1
    else:
        game.current_player = -1
    
    while not game.game_over:

        state = game.board.copy()
        if game.current_player == -1:
            action = agent.choose_action(state)
            game.make_move(action // 3, action % 3)
        else:
            valid_moves = [i for i in range(9) if game.board[i // 3][i % 3] == 0]
            action = random.choice(valid_moves)
            game.make_move(action // 3, action % 3)

        next_state = game.board.copy()
        if game.game_over:
            if game.winner == -1:
                reward = 1
            elif game.winner == 1:
                reward =-1
            else:
                reward = 0
        else:
            reward = 0

        agent.learn(state, action, reward, next_state)

Treinando: 100%|██████████| 1000000/1000000 [05:53<00:00, 2826.91it/s]


In [18]:
def print_q_table(q_table):
    for state, q_values in q_table.items():
        print(f"State: {state}")
        for action, q_value in enumerate(q_values):
            print(f"Action {action}: Q = {q_value}")
        print("-------")

print_q_table(agent.q_table)

State: ((0, 0, 0), (0, 0, 0), (0, 0, 0))
Action 0: Q = 0.592395004966719
Action 1: Q = 0.6469519752921151
Action 2: Q = 0.6164105766727532
Action 3: Q = 0.6428111283528557
Action 4: Q = 0.6108443041410916
Action 5: Q = 0.637786150404426
Action 6: Q = 0.6364048584684455
Action 7: Q = 0.6185245884592753
Action 8: Q = 0.6400248039602144
-------
State: ((1, 0, 0), (0, 0, 0), (0, 0, 0))
Action 0: Q = -0.9999825730661898
Action 1: Q = 0.6015159177276557
Action 2: Q = 0.657944060500502
Action 3: Q = 0.654270085019772
Action 4: Q = 0.6551489767843166
Action 5: Q = 0.6569572298479467
Action 6: Q = 0.656976792431962
Action 7: Q = 0.6543952677687875
Action 8: Q = 0.642074744553349
-------
State: ((1, 0, 0), (-1, 0, 0), (0, 0, 0))
Action 0: Q = 0
Action 1: Q = 0.6667374417659079
Action 2: Q = 0.6652936675739731
Action 3: Q = -0.814697981114816
Action 4: Q = 0.5903068331250887
Action 5: Q = 0.7524832385168309
Action 6: Q = 0.6800274179831922
Action 7: Q = 0.6752297736563324
Action 8: Q = 0.68419093

In [None]:
play_game(agent)

  |   |  
---------
  |   |  
---------
  |   |  
---------
