In [11]:
from TicTacToe import TicTacToe
env = TicTacToe()

In [12]:
env.board

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [13]:
action = [1, 1]
observation, reward, terminated, truncated, info = env.step(action)


In [14]:
info

{'action_space': array([ 0,  1, -1,  0,  0,  0,  0,  0,  0])}

In [15]:
env.board

array([[ 0,  1, -1],
       [ 0,  0,  0],
       [ 0,  0,  0]])

In [16]:
env.action_space

array([ 0,  1, -1,  0,  0,  0,  0,  0,  0])

In [17]:
action = [1, 0]
observation, reward, terminated, truncated, info = env.step(action)

In [18]:
env.board

array([[ 1,  1, -1],
       [-1,  0,  0],
       [ 0,  0,  0]])

In [19]:
env.available_indices

[4, 5, 6, 7, 8]

In [20]:
action = [1, 3]
observation, reward, terminated, truncated, info = env.step(action)

ValueError: Invalid action: Position already occupied.

In [21]:
observation, reward, terminated, truncated, info

((np.int64(1),
  np.int64(1),
  np.int64(-1),
  np.int64(-1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0)),
 -1,
 False,
 False,
 {'action_space': array([ 1,  1, -1, -1,  0,  0,  0,  0,  0])})

In [22]:
reward, terminated

(-1, False)

In [23]:
obs, info = env.reset()

In [24]:
obs

(np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0))

In [25]:
from collections import defaultdict
import numpy as np
from typing import Tuple

class Agent:
    def __init__(self, lr, initial_epsilon, epsilon_decay, min_epsilon, discout=0.95):
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.shape))
        self.lr = lr
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.discount_factor = discout

        self.training_error = []

    def get_action(self, obs, env):
        # if len(env.available_indices) >= 8 and env.available_indices[4] == 0:
        #     return [1, 4]
        if np.random.random() < self.epsilon:
            return [1, np.random.choice(env.available_indices)]

        else:
            q_values = self.q_values[obs]
            valid_actions = [(i, q_values[i]) for i in env.available_indices]
            action = max(valid_actions, key=lambda x: x[1])[0]
            return [1, action]

    def update(self, obs: Tuple[int], action, reward, terminated, next_obs: Tuple[int]):
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = reward + self.discount_factor * future_q_value - self.q_values[obs][action]

        self.q_values[obs][action] += self.lr * temporal_difference
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_decay)

In [26]:
agent = Agent(lr=0.1, initial_epsilon=1, epsilon_decay=2e-6, min_epsilon=0.01)


In [27]:
n_episodes = 1000000
wins, epsilons = 0, []
epochs = 0


In [28]:
from tqdm import tqdm

wins = 0  # Initialize wins counter
epsilons = []  # List to store epsilon values over episodes

# Training loop
for episode in tqdm(range(n_episodes), desc="Training Episodes"):
    
    if episode % 2 == 0:
        obs, info = env.reset(player2_advantage=True)
    else: 
        obs, info = env.reset(player2_advantage=False)
    done = False

    while not done:
        action = agent.get_action(obs, env)
        next_obs, reward, terminated, truncated, info = env.step(action)

        done = terminated
        if done and reward == 1:
            wins += 1

        agent.update(obs, action, reward, terminated, next_obs)
        obs = next_obs

    epsilons.append(agent.epsilon)
    agent.decay_epsilon()

    if (episode + 1) % 50000 == 0:
        print(f"Episode: {episode + 1} - Epsilon: {agent.epsilon:.6f}")

# Final results
print(f"Total wins: {wins}")
win_ratio = (wins / (n_episodes)) * 100
print(f"Win ratio: {win_ratio:.2f}%")

Training Episodes:   5%|▌         | 50503/1000000 [00:17<05:10, 3062.07it/s]

Episode: 50000 - Epsilon: 0.900000


Training Episodes:  10%|█         | 100519/1000000 [00:33<05:28, 2741.61it/s]

Episode: 100000 - Epsilon: 0.800000


Training Episodes:  15%|█▌        | 150212/1000000 [00:49<05:10, 2739.23it/s]

Episode: 150000 - Epsilon: 0.700000


Training Episodes:  20%|██        | 200543/1000000 [01:05<03:58, 3354.52it/s]

Episode: 200000 - Epsilon: 0.600000


Training Episodes:  25%|██▌       | 250409/1000000 [01:20<03:34, 3491.52it/s]

Episode: 250000 - Epsilon: 0.500000


Training Episodes:  30%|███       | 300490/1000000 [01:35<03:18, 3526.03it/s]

Episode: 300000 - Epsilon: 0.400000


Training Episodes:  35%|███▌      | 350383/1000000 [01:49<04:38, 2331.76it/s]

Episode: 350000 - Epsilon: 0.300000


Training Episodes:  40%|████      | 400743/1000000 [02:03<02:34, 3874.37it/s]

Episode: 400000 - Epsilon: 0.200000


Training Episodes:  45%|████▌     | 450450/1000000 [02:16<03:23, 2705.23it/s]

Episode: 450000 - Epsilon: 0.100000


Training Episodes:  50%|█████     | 500446/1000000 [02:29<02:12, 3766.00it/s]

Episode: 500000 - Epsilon: 0.010000


Training Episodes:  55%|█████▌    | 550569/1000000 [02:41<01:47, 4183.50it/s]

Episode: 550000 - Epsilon: 0.010000


Training Episodes:  60%|██████    | 600997/1000000 [02:54<01:35, 4157.46it/s]

Episode: 600000 - Epsilon: 0.010000


Training Episodes:  65%|██████▌   | 650431/1000000 [03:05<01:23, 4178.46it/s]

Episode: 650000 - Epsilon: 0.010000


Training Episodes:  70%|███████   | 700567/1000000 [03:18<01:11, 4215.82it/s]

Episode: 700000 - Epsilon: 0.010000


Training Episodes:  75%|███████▌  | 750717/1000000 [03:30<00:59, 4196.79it/s]

Episode: 750000 - Epsilon: 0.010000


Training Episodes:  80%|████████  | 800422/1000000 [03:42<00:47, 4187.00it/s]

Episode: 800000 - Epsilon: 0.010000


Training Episodes:  85%|████████▌ | 850576/1000000 [03:54<00:36, 4141.88it/s]

Episode: 850000 - Epsilon: 0.010000


Training Episodes:  90%|█████████ | 900631/1000000 [04:06<00:23, 4201.55it/s]

Episode: 900000 - Epsilon: 0.010000


Training Episodes:  95%|█████████▌| 950544/1000000 [04:18<00:12, 3852.77it/s]

Episode: 950000 - Epsilon: 0.010000


Training Episodes: 100%|██████████| 1000000/1000000 [04:30<00:00, 3696.64it/s]

Episode: 1000000 - Epsilon: 0.010000
Total wins: 795035
Win ratio: 79.50%





In [31]:
wins = 0  # Initialize wins counter
epsilons = []  # List to store epsilon values over episodes

# Training loop
for episode in tqdm(range(10000), desc="Training Episodes"):
    obs, info = env.reset(player2_advantage=False)
    done = False

    while not done:
        action = agent.get_action(obs, env)
        next_obs, reward, terminated, truncated, info = env.step(action)

        done = terminated
        if done and reward == 1:
            wins += 1

        agent.update(obs, action, reward, terminated, next_obs)
        obs = next_obs

    epsilons.append(agent.epsilon)
    agent.decay_epsilon()

print(f"Total wins: {wins}")
win_ratio = (wins / (10000)) * 100
print(f"Win ratio: {win_ratio:.2f}%")

Training Episodes: 100%|██████████| 10000/10000 [00:02<00:00, 3893.66it/s]

Total wins: 9774
Win ratio: 97.74%





In [None]:
action = agent.get_action(obs, env)
next_obs, reward, terminated, truncated, info = env.step(action)

done = terminated

agent.update(obs, action, reward, terminated, next_obs)
obs = next_obs

env.display_interface()

[ 1  1 -1]
[1 0 0]
[ 1 -1  0]


In [None]:
env.display_interface()

[ 1  1 -1]
[1 0 0]
[ 1 -1  0]


In [None]:
len(agent.q_values)

3056

In [32]:
import sys
byte_size = sys.getsizeof(agent.q_values)

In [33]:
mb_size = byte_size / (1024 * 1024)  # Convert bytes to megabytes
print(f"Size of q_table is: {mb_size} MB")

Size of q_table is: 0.281341552734375 MB


In [None]:
obs, info = env.reset(player2_advantage=True)

action = agent.get_action(obs, env)
next_obs, reward, terminated, truncated, info = env.step(action, human_opponent=True)

done = terminated

agent.update(obs, action, reward, terminated, next_obs)
obs = next_obs

env.display_interface()

ValueError: invalid literal for int() with base 10: ''

In [None]:
a = np.array([1,2,3])

b = np.array([[1,2,3], 
             [1,2,3]])

a+b

array([[2, 4, 6],
       [2, 4, 6]])

In [34]:
q_table = dict(agent.q_values )

In [35]:
import pickle

with open("q_table.pkl", "wb") as f:
    pickle.dump(q_table, f)

In [None]:
a = [1, 2, 3]
b = a
del a  # Deletes the reference 'a', but the list still exists as 'b' references it
b

[1, 2, 3]

In [None]:
obs

(np.int64(-1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(-1),
 np.int64(0),
 np.int64(1),
 np.int64(-1))