In [2]:
import gym
import gym_tictactoe_dassy
import random
import numpy as np

In [3]:
import os, sys
import time
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import pickle as pkl

In [6]:
import gym_tictactoe_dassy

In [8]:
# Create the tic-tac-toe environment used by every later cell.
# NOTE(review): the "tictactoe-v0" id is presumably registered as a side effect of
# importing gym_tictactoe_dassy — confirm against that package.
env = gym.make("tictactoe-v0")

Error: Attempted to look up malformed environment ID: b'tictactoe'. (Currently all IDs must be of the form ^(?:[\w:-]+\/)?([\w:.-]+)-v(\d+)$.)

In [11]:
class Qagent(object):
    """
    Tabular Q-learning agent with an epsilon-greedy action policy.

    One Q-value is stored per (state, action) pair in ``self.qtable``, an array
    of shape ``(state_size, action_size)`` initialised to zeros.
    """

    def __init__(
        self, env, state_size, action_size, learning_parameters, exploration_parameters
    ):
        """
        Initialize the Q-learning agent.

        Args:
          env: environment object; only stored on the instance as ``self.env``
          state_size (int): number of rows (states) of the Q-table
          action_size (int): number of columns (actions) of the Q-table
          learning_parameters (dict): must contain the keys
            ``"learning_rate"`` and ``"gamma"``
          exploration_parameters (dict): must contain the keys ``"epsilon"``,
            ``"max_epsilon"``, ``"min_epsilon"`` and ``"decay_rate"``

        Raises:
          KeyError: if one of the required dictionary keys is missing
        """
        # init the Q-table
        self.qtable = np.zeros((state_size, action_size))

        # learning parameters
        self.learning_rate = learning_parameters["learning_rate"]
        self.gamma = learning_parameters["gamma"]

        # exploration parameters
        self.epsilon = exploration_parameters["epsilon"]
        self.max_epsilon = exploration_parameters["max_epsilon"]
        self.min_epsilon = exploration_parameters["min_epsilon"]
        self.decay_rate = exploration_parameters["decay_rate"]

        self.env = env

    def update_qtable(self, state, new_state, action, reward, done):
        """
        Compute the Q-learning update for one transition:
        Q(s,a) = Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]

        The Q-table itself is NOT modified here; the caller is expected to
        write the returned value back into ``qtable[state, action]``.

        Args:
          state (int): current state of the environment
          new_state (int): new state of the environment
          action (int): current action taken by agent
          reward (int): current reward received from env
          done (boolean): variable indicating if env is done
        Returns:
          float: the updated value for Q(state, action)
        """
        current_value = self.qtable[state, action]
        # (1 - done) removes the bootstrapped future value on terminal transitions
        future_value = self.gamma * np.max(self.qtable[new_state, :]) * (1 - done)
        return current_value + self.learning_rate * (
            reward + future_value - current_value
        )

    def update_epsilon(self, episode):
        """
        Reduce epsilon with an exponential decay from max_epsilon towards
        min_epsilon.

        Args:
          episode (int): number of episode
        """
        self.epsilon = self.min_epsilon + (
            self.max_epsilon - self.min_epsilon
        ) * np.exp(-self.decay_rate * episode)

    def get_action(self, state, action_space):
        """
        Select an action epsilon-greedily among the legal actions.

        With probability (1 - epsilon) the legal action with the highest
        Q-value for ``state`` is returned (first one on ties); otherwise a
        legal action is drawn uniformly at random.

        Args:
          state (int): current state of the environment/agent
          action_space (array): array with legal actions
        Returns:
          action (int): action that the agent will take in the next step
        Raises:
          ValueError: if ``action_space`` contains no action
        """
        legal_actions = np.asarray(action_space)
        if legal_actions.size == 0:
            raise ValueError("action_space must contain at least one legal action")

        if random.uniform(0, 1) >= self.epsilon:
            # exploitation: highest Q-value among the legal actions only.
            # (The previous rank-based lookup used an ascending argsort and took
            # the minimum rank, which selected the LOWEST-valued legal action.)
            legal_values = self.qtable[state, legal_actions]
            action = legal_actions[np.argmax(legal_values)]
        else:
            # exploration, random choice among the legal actions
            action = np.random.choice(legal_actions)
        return action


In [22]:

import time

import numpy as np
import gym
import gym_tictactoe_dassy



# Self-play training: one shared Q-table is updated by both players, who
# alternate moves on the same environment.
#
# NOTE(review): create_state_dictionary, reshape_state, load_qtable, save_qtable
# and test_self_play_learning are not defined or imported in any visible cell;
# they must be in scope before this cell runs (Restart & Run All will fail otherwise).

# lookup used below to turn a reshaped board observation into a Q-table row index
state_dict = create_state_dictionary()

state_size = env.observation_space.n
action_size = env.action_space.n

# player identifiers passed to env.step
player1 = 1
player2 = 2

learning_parameters = {"learning_rate": 1.0, "gamma": 0.9}

exploration_parameters = {
    "epsilon": 1.0,
    "max_epsilon": 1.0,
    "min_epsilon": 0.0,
    "decay_rate": 0.000001,
}

# set training parameters
episodes = 1000  # 10**6 * 2
max_steps = 9  # upper bound on moves per episode (one per board cell)

# name of the qtable when saved
name = "qtable"
load = True
save = True
test = True

num_test_games = 1

# accumulated reward per episode, one entry per player
player1_reward_array = np.zeros(episodes)
player2_reward_array = np.zeros(episodes)

# init the q-learning algorithm
qagent = Qagent(
    env, state_size, action_size, learning_parameters, exploration_parameters
)

if load:
    try:
        qagent.qtable = load_qtable(name)
        print("{}.npy loaded!".format(name))
    except Exception as exc:
        # Best effort: training starts from the zero-initialised table. The reason
        # is printed so that a missing helper or a corrupt file is not mistaken
        # for a simply absent file; KeyboardInterrupt is no longer swallowed.
        print("qtable could not be loaded!")
        print("reason: {!r}".format(exc))


# TODO: Track the actions taken over time while playing,  9*8*7*6*5*4*3*2*1

# start the training
start_time = time.time()

for episode_i in range(episodes):
    state = env.reset()
    state = state_dict[reshape_state(state)]

    # all nine board positions are legal at the start of an episode
    action_space = np.arange(9)

    # reset the reward of the players
    player1_reward = 0
    player2_reward = 0

    # change start of players, randomly change the order players to start the game
    start = np.random.randint(2)  # integer either 0 or 1

    for _step in range(start, max_steps + start):
        # alternate the moves of the players: even step -> player 1, odd -> player 2
        current_player = player1 if _step % 2 == 0 else player2

        action = qagent.get_action(state, action_space)

        # remove action from the action space
        action_space = action_space[action_space != action]

        # NOTE(review): the recorded output of this cell is
        # "TypeError: step() takes 2 positional arguments but 3 were given", i.e. the
        # step() reached through `env` only accepts the action. Confirm the
        # environment's step signature (or whether a gym wrapper hides it).
        new_state, reward, done, _ = env.step(action, current_player)
        new_state = state_dict[reshape_state(new_state)]

        qagent.qtable[state, action] = qagent.update_qtable(
            state, new_state, action, reward, done
        )

        # new state
        state = new_state
        if current_player == player1:
            player1_reward += reward
        else:
            player2_reward += reward

        # stopping criterion
        if done:
            break

    # reduce epsilon for exploration-exploitation tradeoff
    qagent.update_epsilon(episode_i)

    player1_reward_array[episode_i] = player1_reward
    player2_reward_array[episode_i] = player2_reward

    if episode_i % 100000 == 0:
        print("episode: {}, epsilon: {}".format(episode_i, round(qagent.epsilon, 2)))
        print(
            "elapsed time [min]: {}, done [%]: {}".format(
                round((time.time() - start_time) / 60.0, 2), episode_i / episodes * 100
            )
        )


# Bind the trained table unconditionally: it was previously assigned only inside
# `if save:`, so `save = False` with `test = True` raised a NameError below.
qtable = qagent.qtable

if save:
    save_qtable(qtable, name)

# test the algorithm with playing against it
if test:
    test_self_play_learning(env, qtable, max_steps, num_test_games, state_dict)

Number of legal states: 8953
qtable could not be loaded!


TypeError: step() takes 2 positional arguments but 3 were given