## The Tic-Tac-Toe environment

The [Tic-Tac-Toe](https://github.com/MauroLuzzatto/OpenAI-Gym-TicTacToe-Environment) environment is a simple game environment for training reinforcement learning agents.

In [16]:
from IPython.display import Image

# Display the illustration hosted at the URL below, rendered 300 px wide.
board_image_url = 'https://img.poki.com/cdn-cgi/image/quality=78,width=600,height=600,fit=cover,f=auto/85535e05d1f130b16751c8308cfbb19b.png'
Image(url=board_image_url, width=300)

In [17]:
# load the python modules
import time
import sys
import warnings

import gym
import numpy as np
from tqdm import tqdm
import gym_TicTacToe

from qagent import Qagent
from player import Player
from utils import (
    create_state_dictionary,
    load_qtable,
    play_tictactoe,
    reshape_state,
    save_qtable,
)

# Suppress all warnings to keep the notebook output readable, but only when
# the interpreter was started without explicit warning options
# (sys.warnoptions is empty in that case).
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

In [18]:
# Create the "TTT-v0" environment (made available by importing gym_TicTacToe).
# NOTE(review): `small` / `large` look like the per-move and end-of-game
# reward magnitudes -- confirm against the environment's constructor.
env = gym.make("TTT-v0", small=-1, large=10)

In [19]:
# draw ten random actions from the environment's action space
[env.action_space.sample() for _ in range(10)]

[5, 0, 0, 1, 7, 6, 3, 0, 2, 0]

In [20]:
# reset the environment and print the rendered board
env.reset()
rendered_board = env.render()
print(rendered_board)

╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛


In [21]:
# Play a single move by hand: the player with color 1 takes action 4.
color, action = 1, 4

# env.step takes an (action, color) tuple and returns the usual gym 4-tuple;
# the info entry is not needed here.
new_state, reward, done, _ = env.step((action, color))
print(new_state, reward, done)
print(env.render())


[[0 0 0]
 [0 1 0]
 [0 0 0]] -1 False
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛


In [22]:
# Lookup table used below to turn a reshaped board into a state index.
state_dict = create_state_dictionary()

# Sizes of the observation and action spaces, later passed to the Q-agent.
state_size, action_size = env.observation_space.n, env.action_space.n

Number of legal states: 8953


In [23]:
# training budget
episodes = 90_000  # 10**6 * 2
max_steps = 9  # upper bound on the number of moves played per episode

# workflow switches (`save` gates writing the Q-table to disk after training)
load, save, test = False, True, True

num_test_games = 1

In [24]:
# learning settings handed to the Q-agent
learning_parameters = dict(learning_rate=1.0, gamma=0.9)

# epsilon settings handed to the Q-agent (bounds and decay rate)
exploration_parameters = dict(
    max_epsilon=1.0,
    min_epsilon=0.0,
    decay_rate=0.000005,
)

# folder and file name passed to save_qtable once training has finished
folder = "tables"
name = f"qtable_{episodes}"

qagent = Qagent(state_size, action_size, learning_parameters, exploration_parameters)

In [25]:

def play(player, state, action_space):
    """Let the shared Q-agent make one move for ``player`` and learn from it.

    Relies on the notebook-level ``qagent``, ``env`` and ``state_dict``.

    Args:
        player: object providing ``color`` (sent to the environment together
            with the action) and ``add_reward`` (called with the step reward).
        state: current state index; used as the row index into
            ``qagent.qtable``.
        action_space: array of actions that are still available; filtered
            with a boolean mask, so a numpy array is expected.

    Returns:
        Tuple ``(state, action_space, done)``: the index of the state reached,
        the action space without the action just taken, and the ``done`` flag
        reported by the environment.
    """
    chosen_action = qagent.get_action(state, action_space)

    # the chosen action is no longer available for later moves
    remaining_actions = action_space[action_space != chosen_action]

    board, reward, done, _ = env.step((chosen_action, player.color))
    next_state = state_dict[reshape_state(board)]

    # store the updated value for the (state, action) pair that was just played
    qagent.qtable[state, chosen_action] = qagent.update_qtable(
        state, next_state, chosen_action, reward, done
    )
    player.add_reward(reward)

    return next_state, remaining_actions, done

In [26]:
# Train the Q-table through self-play: both players choose their moves from,
# and write their updates to, the same `qagent` (see `play`).
start_time = time.time()

player_1 = Player(color=1, episodes=episodes)
player_2 = Player(color=2, episodes=episodes)

# print a progress line every `log_interval` episodes
log_interval = 10_000

for episode in tqdm(range(episodes)):
    state = env.reset()
    state = state_dict[reshape_state(state)]

    # every game starts with all nine actions available
    action_space = np.arange(9)

    player_1.reset_reward()
    player_2.reset_reward()

    # randomly choose who starts: offsetting the step counter by 0 or 1
    # flips the parity test below
    start = np.random.randint(2)

    for _step in range(start, max_steps + start):
        # alternate the moves of the players
        current_player = player_1 if _step % 2 == 0 else player_2
        state, action_space, done = play(current_player, state, action_space)

        if done:
            break

    # reduce epsilon for exploration-exploitation tradeoff
    qagent.update_epsilon(episode)
    player_1.save_reward(episode)
    player_2.save_reward(episode)

    if episode % log_interval == 0:
        sum_q_table = np.sum(qagent.qtable)
        time_passed = round((time.time() - start_time) / 60.0, 2)

        # adjacent f-strings are concatenated, so the source indentation does
        # not end up inside the printed line
        print(
            f"episode: {episode}, "
            f"epsilon: {round(qagent.epsilon, 2)}, "
            f"sum q-table: {sum_q_table}, "
            f"elapsed time [min]: {time_passed}, "
            f"done [%]: {episode / episodes * 100:.1f}"
        )

  0%|          | 27/90000 [00:00<05:43, 262.14it/s]

episode: 0,             epsilon: 1.0,             sum q-table: 3.0,             elapsed time [min]: 0.0,              done [%]: 0.0             


 11%|█         | 10046/90000 [00:30<04:00, 333.08it/s]

episode: 10000,             epsilon: 0.01,             sum q-table: 39852.392319000006,             elapsed time [min]: 0.51,              done [%]: 11.11111111111111             


 22%|██▏       | 20035/90000 [01:04<04:11, 278.27it/s]

episode: 20000,             epsilon: 0.0,             sum q-table: 39970.420419,             elapsed time [min]: 1.07,              done [%]: 22.22222222222222             


 33%|███▎      | 30052/90000 [01:35<02:43, 367.76it/s]

episode: 30000,             epsilon: 0.0,             sum q-table: 39970.420419,             elapsed time [min]: 1.58,              done [%]: 33.33333333333333             


 44%|████▍     | 39412/90000 [02:05<02:30, 335.93it/s]

In [None]:
# hand the learned Q-table, together with `folder` and `name`, to
# save_qtable -- but only when the `save` switch is on
if save:
    save_qtable(qagent.qtable, folder, name)

In [None]:
# test the algorithm by playing against it; skipped when `test` is False and
# repeated `num_test_games` times
if test:
    for _ in range(num_test_games):
        play_tictactoe(env, qagent.qtable, max_steps, state_dict)