In [3]:
# Pi(s,a)
from dataclasses import dataclass
from typing import Dict
import numpy as np


In [4]:
Policy = Dict[int, Dict[int, float]]

# V(s)
ValueFunction = Dict[int, float]

# Q(s,a)
ActionValueFunction = Dict[int, Dict[int, float]]


# Pi(s,a) and V(s)
@dataclass
class PolicyAndValueFunction:
    pi: Policy
    v: ValueFunction


# Pi(s,a) and Q(s,a)
@dataclass
class PolicyAndActionValueFunction:
    pi: Policy
    q: ActionValueFunction


In [5]:

class MDPEnv:
    def states(self) -> np.ndarray:
        pass

    def actions(self) -> np.ndarray:
        pass

    def rewards(self) -> np.ndarray:
        pass

    def is_state_terminal(self, s: int) -> bool:
        pass

    def transition_probability(self, s: int, a: int, s_p: int, r: float) -> float:
        pass

    def view_state(self, s: int):
        pass


class SingleAgentEnv:
    def state_id(self) -> int:
        pass

    def is_game_over(self) -> bool:
        pass

    def act_with_action_id(self, action_id: int):
        pass

    def score(self) -> float:
        pass

    def available_actions_ids(self) -> np.ndarray:
        pass

    def reset(self):
        pass

    def view(self):
        pass

    def reset_random(self):
        pass


class DeepSingleAgentWithDiscreteActionsEnv:
    def state_description(self) -> np.ndarray:
        pass

    def state_description_length(self) -> int:
        pass

    def max_actions_count(self) -> int:
        pass

    def is_game_over(self) -> bool:
        pass

    def act_with_action_id(self, action_id: int):
        pass

    def score(self) -> float:
        pass

    def available_actions_ids(self) -> np.ndarray:
        pass

    def reset(self):
        pass

In [4]:
class LineWorld(MDPEnv):
    def __init__(self, cells_count: int):
        self.cells_count = cells_count
        self.__states = np.arange(self.cells_count)
        self.__actions = np.array([0, 1])
        self.__rewards = np.array([-1, 0, 1])
        self.probality = self.probality_setup()

    def probality_setup(self):
        p = np.zeros((len(self.__states), len(self.__actions), len(self.__states), len(self.__rewards)))
        for s in range(1, self.cells_count - 2):
            p[s, 1, s + 1, 1] = 1.0

        for s in range(2, self.cells_count - 1):
            p[s, 0, s - 1, 1] = 1.0

        p[self.cells_count - 2, 1, self.cells_count - 1, 2] = 1.0
        p[1, 0, 0, 0] = 1.0
        return p

    def states(self) -> np.ndarray:
        return self.__states

    def actions(self) -> np.ndarray:
        return self.__actions

    def rewards(self) -> np.ndarray:
        return self.__rewards

    def is_state_terminal(self, s: int) -> bool:
        return s == self.cells_count - 1 or s == 0

    def transition_probability(self, s: int, a: int, s_p: int, r: float) -> float:
        return self.probality[s, a, s_p, r]

    def view_state(self, s: int):
        pass



In [5]:

class GridWorld(MDPEnv):
    def __init__(self, lines: int, columns: int):
        self.lines = lines
        self.columns = columns
        self.cells_count = lines * columns
        self.negative_terminal = 4
        self.positive_terminal = self.cells_count - 1
        self.__states = np.arange(self.cells_count)
        self.__actions = np.array([0, 1, 2, 3])
        self.__rewards = np.array([-1, 0, 1])
        self.probality = self.probality_setup()

    def probality_setup(self):
        p = np.zeros((len(self.__states), len(self.__actions), len(self.__states), len(self.__rewards)))
        for line in range(0, self.lines):
            for column in range(0, self.columns - 1):
                s = line * self.columns + column
                if s != self.negative_terminal and s != self.positive_terminal:
                    if s + 1 == self.positive_terminal:
                        p[s, 1, s + 1, 2] = 1.0
                    elif s + 1 == self.negative_terminal:
                        p[s, 1, s + 1, 0] = 1.0
                    else:
                        p[s, 1, s + 1, 1] = 1.0

            for column in range(1, self.columns):
                s = line * self.columns + column
                if s != self.positive_terminal and s != self.negative_terminal:
                    if s - 1 == self.negative_terminal:
                        p[s, 0, s - 1, 0] = 1.0
                    elif s - 1 == self.positive_terminal:
                        p[s, 0, s - 1, 2] = 1.0
                    else:
                        p[s, 0, s - 1, 1] = 1.0

        for column in range(0, self.columns):
            for line in range(0, self.lines - 1):
                s = self.columns * line + column
                s2 = self.columns * (line + 1) + column
                # up
                if s2 != self.positive_terminal and s2 != self.negative_terminal:
                    if s == self.negative_terminal:
                        p[s2, 2, s, 0] = 1.0
                    elif s == self.positive_terminal:
                        p[s2, 2, s, 2] = 1.0
                    else:
                        p[s2, 2, s, 1] = 1.0

                # down
                if s != self.negative_terminal and s != self.positive_terminal:
                    if s2 == self.positive_terminal:
                        p[s, 3, s2, 2] = 1.0
                    elif s2 == self.negative_terminal:
                        p[s, 3, s2, 0] = 1.0
                    else:
                        p[s, 3, s2, 1] = 1.0
        return p

    def states(self) -> np.ndarray:
        return self.__states

    def actions(self) -> np.ndarray:
        return self.__actions

    def rewards(self) -> np.ndarray:
        return self.__rewards

    def is_state_terminal(self, s: int) -> bool:
        return s == self.positive_terminal or s == self.negative_terminal

    def transition_probability(self, s: int, a: int, s_p: int, r: float) -> float:
        return self.probality[s, a, s_p, r]

    def view_state(self, s: int):
        pass

# Dynamic programming 

In [9]:

def policy_evaluation_on_line_world() -> ValueFunction:
    """
    Creates a Line World of 7 cells (leftmost and rightmost are terminal, with -1 and 1 reward respectively)
    Launches a Policy Evaluation Algorithm in order to find the Value Function of a uniform random policy
    Returns the Value function (V(s)) of this policy
    """
    env = LineWorld(7)
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())

    theta = 0.00001
    gamma = 1.0

    V = np.zeros((len(env.states()),))
    while True:
        delta = 0
        for s in env.states():
            old_v = V[s]
            V[s] = 0.0
            for a in env.actions():
                for s_next in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])
            delta = max(delta, abs(V[s] - old_v))

        if delta < theta:
            break

    return dict(enumerate(V.flatten(), 1))


def policy_iteration_on_line_world() -> PolicyAndValueFunction:
    """
    Creates a Line World of 7 cells (leftmost and rightmost are terminal, with -1 and 1 reward respectively)
    Launches a Policy Iteration Algorithm in order to find the Optimal Policy and its Value Function
    Returns the Policy (Pi(s,a)) and its Value Function (V(s))
    """
    env = LineWorld(7)
    V = np.zeros((len(env.states()),))
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())

    theta = 0.00001
    gamma = 1.0

    while True:
        while True:
            delta = 0
            for s in env.states():
                old_v = V[s]
                V[s] = 0.0
                for a in env.actions():
                    for s_next in env.states():
                        for r_idx, r in enumerate(env.rewards()):
                            V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])
                delta = max(delta, abs(V[s] - old_v))

            if delta < theta:
                break

        policy_stable = True
        for s in env.states():
            old_policy = pi[s, :]

            best_a = None
            best_a_value = None
            for a in env.actions():
                a_value = 0
                for s_p in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        a_value += env.transition_probability(s, a, s_p, r_idx) * (r + gamma * V[s_p])
                if best_a_value is None or best_a_value < a_value:
                    best_a_value = a_value
                    best_a = a

            pi[s, :] = 0.0
            pi[s, best_a] = 1.0
            if not np.array_equal(pi[s], old_policy):
                policy_stable = False

        if policy_stable:
            break

    final_pi = {}
    for indice, value in enumerate(pi):
        final_pi[indice] = dict(enumerate(value.flatten(), 1))

    return PolicyAndValueFunction(final_pi, dict(enumerate(V.flatten(), 1)))


def value_iteration_on_line_world() -> PolicyAndValueFunction:
    """
    Creates a Line World of 7 cells (leftmost and rightmost are terminal, with -1 and 1 reward respectively)
    Launches a Value Iteration Algorithm in order to find the Optimal Policy and its Value Function
    Returns the Policy (Pi(s,a)) and its Value Function (V(s))
    """
    env = LineWorld(7)
    V = np.zeros((len(env.states()),))
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())
    pi2 = pi.copy()

    theta = 0.00001
    gamma = 1.0

    while True:
        delta = 0
        for s in env.states():
            old_v = V[s]
            V[s] = 0.0
            best_a_value = None
            best_a = None
            for a in env.actions():
                a_value = 0
                for s_p in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        pre_a_value = env.transition_probability(s, a, s_p, r_idx) * (r + gamma * V[s_p])
                        a_value += pre_a_value
                        V[s] += pi[s, a] * pre_a_value
                if best_a_value is None or best_a_value < a_value:
                    best_a_value = a_value
                    best_a = a

            delta = max(delta, abs(V[s] - old_v))
            pi2[s, :] = 0.0
            pi2[s, best_a] = 1.0

        if delta < theta:
            break

    final_pi = {}
    for indice, value in enumerate(pi2):
        final_pi[indice] = dict(enumerate(value.flatten(), 1))

    return PolicyAndValueFunction(final_pi, dict(enumerate(V.flatten(), 1)))


def policy_evaluation_on_grid_world() -> ValueFunction:
    """
    Creates a Grid World of 5x5 cells (upper rightmost and lower rightmost are terminal, with -1 and 1 reward respectively)
    Launches a Policy Evaluation Algorithm in order to find the Value Function of a uniform random policy
    Returns the Value function (V(s)) of this policy
    """
    env = GridWorld(5, 5)
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())

    theta = 0.00001
    gamma = 1.0

    V = np.zeros((len(env.states()),))
    while True:
        delta = 0
        for s in env.states():
            old_v = V[s]
            V[s] = 0.0
            for a in env.actions():
                for s_next in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])
            delta = max(delta, abs(V[s] - old_v))

        if delta < theta:
            break

    return dict(enumerate(V.flatten(), 1))


def policy_iteration_on_grid_world() -> PolicyAndValueFunction:
    """
    Creates a Grid World of 5x5 cells (upper rightmost and lower rightmost are terminal, with -1 and 1 reward respectively)
    Launches a Policy Iteration Algorithm in order to find the Optimal Policy and its Value Function
    Returns the Policy (Pi(s,a)) and its Value Function (V(s))
    """
    env = GridWorld(5, 5)
    V = np.zeros((len(env.states()),))
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())

    theta = 0.00001
    gamma = 1.0

    while True:
        while True:
            delta = 0
            for s in env.states():
                old_v = V[s]
                V[s] = 0.0
                for a in env.actions():
                    for s_next in env.states():
                        for r_idx, r in enumerate(env.rewards()):
                            V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])
                delta = max(delta, abs(V[s] - old_v))

            if delta < theta:
                break

        policy_stable = True
        for s in env.states():
            old_policy = pi[s, :]

            best_a = None
            best_a_value = None
            for a in env.actions():
                a_value = 0
                for s_p in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        a_value += env.transition_probability(s, a, s_p, r_idx) * (r + gamma * V[s_p])
                if best_a_value is None or best_a_value < a_value:
                    best_a_value = a_value
                    best_a = a

            pi[s, :] = 0.0
            pi[s, best_a] = 1.0
            if not np.array_equal(pi[s], old_policy):
                policy_stable = False

        if policy_stable:
            break

    final_pi = {}
    for indice, value in enumerate(pi):
        final_pi[indice] = dict(enumerate(value.flatten(), 1))

    return PolicyAndValueFunction(final_pi, dict(enumerate(V.flatten(), 1)))


def value_iteration_on_grid_world() -> PolicyAndValueFunction:
    """
    Creates a Grid World of 5x5 cells (upper rightmost and lower rightmost are terminal, with -1 and 1 reward respectively)
    Launches a Value Iteration Algorithm in order to find the Optimal Policy and its Value Function
    Returns the Policy (Pi(s,a)) and its Value Function (V(s))
    """
    env = GridWorld(5, 5)
    V = np.zeros((len(env.states()),))
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())
    pi2 = pi.copy()

    theta = 0.00001
    gamma = 1.0

    while True:
        delta = 0
        for s in env.states():
            old_v = V[s]
            V[s] = 0.0
            best_a_value = None
            best_a = None
            for a in env.actions():
                a_value = 0
                for s_p in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        pre_a_value = env.transition_probability(s, a, s_p, r_idx) * (r + gamma * V[s_p])
                        a_value += pre_a_value
                        V[s] += pi[s, a] * pre_a_value
                if best_a_value is None or best_a_value < a_value:
                    best_a_value = a_value
                    best_a = a

            delta = max(delta, abs(V[s] - old_v))
            pi2[s, :] = 0.0
            pi2[s, best_a] = 1.0

        if delta < theta:
            break

    final_pi = {}
    for indice, value in enumerate(pi2):
        final_pi[indice] = dict(enumerate(value.flatten(), 1))

    return PolicyAndValueFunction(final_pi, dict(enumerate(V.flatten(), 1)))

"""
def policy_evaluation_on_secret_env1() -> ValueFunction:
    env = Env1()
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())

    theta = 0.00001
    gamma = 1.0

    V = np.zeros((len(env.states()),))
    while True:
        delta = 0
        for s in env.states():
            old_v = V[s]
            V[s] = 0.0
            for a in env.actions():
                for s_next in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])
            delta = max(delta, abs(V[s] - old_v))

        if delta < theta:
            break

    return dict(enumerate(V.flatten(), 1))


def policy_iteration_on_secret_env1() -> PolicyAndValueFunction:
    env = Env1()
    V = np.zeros((len(env.states()),))
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())

    theta = 0.00001
    gamma = 1.0

    while True:
        while True:
            delta = 0
            for s in env.states():
                old_v = V[s]
                V[s] = 0.0
                for a in env.actions():
                    for s_next in env.states():
                        for r_idx, r in enumerate(env.rewards()):
                            V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])
                delta = max(delta, abs(V[s] - old_v))

            if delta < theta:
                break

        policy_stable = True
        for s in env.states():
            old_policy = pi[s, :]

            best_a = None
            best_a_value = None
            for a in env.actions():
                a_value = 0
                for s_p in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        a_value += env.transition_probability(s, a, s_p, r_idx) * (r + gamma * V[s_p])
                if best_a_value is None or best_a_value < a_value:
                    best_a_value = a_value
                    best_a = a

            pi[s, :] = 0.0
            pi[s, best_a] = 1.0
            if not np.array_equal(pi[s], old_policy):
                policy_stable = False

        if policy_stable:
            break

    final_pi = {}
    for indice, value in enumerate(pi):
        final_pi[indice] = dict(enumerate(value.flatten(), 1))

    return PolicyAndValueFunction(final_pi, dict(enumerate(V.flatten(), 1)))


def value_iteration_on_secret_env1() -> PolicyAndValueFunction:
    env = Env1()
    V = np.zeros((len(env.states()),))
    pi = np.ones((len(env.states()), len(env.actions())))
    pi /= len(env.actions())
    pi2 = pi.copy()

    theta = 0.00001
    gamma = 1.0

    while True:
        delta = 0
        for s in env.states():
            old_v = V[s]
            V[s] = 0.0
            best_a_value = None
            best_a = None
            for a in env.actions():
                a_value = 0
                for s_p in env.states():
                    for r_idx, r in enumerate(env.rewards()):
                        pre_a_value = env.transition_probability(s, a, s_p, r_idx) * (r + gamma * V[s_p])
                        a_value += pre_a_value
                        V[s] += pi[s, a] * pre_a_value
                if best_a_value is None or best_a_value < a_value:
                    best_a_value = a_value
                    best_a = a

            delta = max(delta, abs(V[s] - old_v))
            pi2[s, :] = 0.0
            pi2[s, best_a] = 1.0

        if delta < theta:
            break

    final_pi = {}
    for indice, value in enumerate(pi2):
        final_pi[indice] = dict(enumerate(value.flatten(), 1))

    return PolicyAndValueFunction(final_pi, dict(enumerate(V.flatten(), 1)))
"""


'\ndef policy_evaluation_on_secret_env1() -> ValueFunction:\n    env = Env1()\n    pi = np.ones((len(env.states()), len(env.actions())))\n    pi /= len(env.actions())\n\n    theta = 0.00001\n    gamma = 1.0\n\n    V = np.zeros((len(env.states()),))\n    while True:\n        delta = 0\n        for s in env.states():\n            old_v = V[s]\n            V[s] = 0.0\n            for a in env.actions():\n                for s_next in env.states():\n                    for r_idx, r in enumerate(env.rewards()):\n                        V[s] += pi[s, a] * env.transition_probability(s, a, s_next, r_idx) * (r + gamma * V[s_next])\n            delta = max(delta, abs(V[s] - old_v))\n\n        if delta < theta:\n            break\n\n    return dict(enumerate(V.flatten(), 1))\n\n\ndef policy_iteration_on_secret_env1() -> PolicyAndValueFunction:\n    env = Env1()\n    V = np.zeros((len(env.states()),))\n    pi = np.ones((len(env.states()), len(env.actions())))\n    pi /= len(env.actions())\n\n    t

## Result fort Dynamic programming 

In [10]:
    print(policy_evaluation_on_line_world())
    print(policy_iteration_on_line_world())
    print(value_iteration_on_line_world())

    print(policy_evaluation_on_grid_world())
    print(policy_iteration_on_grid_world())
    print(value_iteration_on_grid_world())
"""
    print(policy_evaluation_on_secret_env1())
    print(policy_iteration_on_secret_env1())
    print(value_iteration_on_secret_env1())
"""

{1: 0.0, 2: -0.666686199082779, 3: -0.3333626319575018, 4: -2.9298624168505594e-05, 5: 0.33331135936520695, 6: 0.6666556796826035, 7: 0.0}
PolicyAndValueFunction(pi={0: {1: 1.0, 2: 0.0}, 1: {1: 0.0, 2: 1.0}, 2: {1: 0.0, 2: 1.0}, 3: {1: 0.0, 2: 1.0}, 4: {1: 0.0, 2: 1.0}, 5: {1: 0.0, 2: 1.0}, 6: {1: 1.0, 2: 0.0}}, v={1: 0.0, 2: -0.666686199082779, 3: -0.3333626319575018, 4: -2.9298624168505594e-05, 5: 0.33331135936520695, 6: 0.6666556796826035, 7: 0.0})
PolicyAndValueFunction(pi={0: {1: 1.0, 2: 0.0}, 1: {1: 0.0, 2: 1.0}, 2: {1: 0.0, 2: 1.0}, 3: {1: 0.0, 2: 1.0}, 4: {1: 0.0, 2: 1.0}, 5: {1: 0.0, 2: 1.0}, 6: {1: 1.0, 2: 0.0}}, v={1: 0.0, 2: -0.666686199082779, 3: -0.3333626319575018, 4: -2.9298624168505594e-05, 5: 0.33331135936520695, 6: 0.6666556796826035, 7: 0.0})
{1: -0.012408241005487585, 2: -0.03849208733039875, 3: -0.10946152767210533, 4: -0.3206577559932595, 5: 0.0, 6: -0.011130448543747447, 7: -0.032086147618454595, 8: -0.07868611730403527, 9: -0.1731645441465791, 10: -0.2932932689

'\nprint(policy_evaluation_on_secret_env1())\nprint(policy_iteration_on_secret_env1())\nprint(value_iteration_on_secret_env1())\n'

In [11]:
def max_dict(d):
  # returns the argmax (key) and max (value) from a dictionary
  max_key = None
  max_val = float('-inf')
  for k, v in d.items():
    if v > max_val:
      max_val = v
      max_key = k
  return max_key, max_val


## Tictactoe Environment

In [8]:
def init_tic_tac_toe_dict():
    dict = {}
    all_possible_states = 9
    for s in range(all_possible_states):
        dict[s] = {}
        for a in range(all_possible_states):
            dict[s][a] = 0
    return dict


class TicTacToe(SingleAgentEnv):
    def __init__(self):
        self.cases = [-1] * 9
        self.game_state = 0
        self.game_over = False
        self.player_turn = True
        self.player_value = 1
        self.random_player_value = 0
        self.current_score = 0.0
        self.reset()

    def state_id(self) -> int:
        sum = 0
        available_actions_size = 2
        for i in range(len(self.cases)):
            case = self.cases[i]
            if case == self.player_value:
                sum += pow(available_actions_size, i)
            elif case == self.random_player_value:
                sum += pow(available_actions_size, len(self.cases) + i)
        return sum

    def is_game_over(self) -> bool:
        return self.game_over

    def act_with_action_id(self, action_id: int):
        if self.cases[action_id] != -1:
            print(self.cases)
            print(action_id)
            print(self.available_actions_ids())
        assert (action_id < len(self.cases))
        assert (self.cases[action_id] == -1)
        assert (not self.game_over)

        if self.player_turn:
            self.cases[action_id] = self.player_value
        else:
            self.cases[action_id] = self.random_player_value

        self.player_turn = not self.player_turn
        self.game_state = self.state_id()

        if self.tictactoe_ended(self.player_value):
            self.game_over = True
            self.current_score = 1.0
        elif self.tictactoe_ended(self.random_player_value):
            self.game_over = True
            self.current_score = -1.0
        elif -1 not in self.cases:
            self.game_over = True
            self.current_score = 0.0
        elif not self.player_turn:
            rand = random.randint(0, 8)
            while self.cases[rand] != -1:
                rand = random.randint(0, 8)
            self.act_with_action_id(rand)

    def score(self) -> float:
        return self.current_score

    def available_actions_ids(self) -> np.ndarray:
        if self.game_over:
            return np.array([], dtype=np.int)
        available_actions = []
        for i in range(len(self.cases)):
            if self.cases[i] == -1:
                available_actions.append(i)
        return np.array(available_actions)

    def reset(self):
        self.game_over = False
        self.current_score = 0.0
        self.game_state = 0
        self.player_turn = True
        self.cases = [-1] * 9

    def line_checked(self, cases) -> bool:
        return (0 in cases and 1 in cases and 2 in cases) or \
               (3 in cases and 4 in cases and 5 in cases) or \
               (6 in cases and 7 in cases and 8 in cases)

    def column_checked(self, cases) -> bool:
        return (0 in cases and 3 in cases and 6 in cases) or \
               (1 in cases and 4 in cases and 7 in cases) or \
               (2 in cases and 5 in cases and 8 in cases)

    def diagonal_checked(self, cases) -> bool:
        return (0 in cases and 4 in cases and 8 in cases) or \
               (2 in cases and 4 in cases and 6 in cases)

    def tictactoe_ended(self, player_indice) -> bool:
        player_indice_cases = []
        for i, case in enumerate(self.cases):
            if case == player_indice:
                player_indice_cases.append(i)

        if self.line_checked(player_indice_cases) or self.column_checked(player_indice_cases) or self.diagonal_checked(player_indice_cases):
            return True
        return False


# Monte Carlo 

## Monte Carlo Es

In [12]:
def algo_monte_carlo_es(env) -> PolicyAndActionValueFunction:
    max_episodes_count = 10000
    gamma = 0.85

    pi = {}
    q = {}
    returns = {}

    for ep in tqdm(range(max_episodes_count)):
        env.reset()
        S = []
        A = []
        R = []


        while not env.is_game_over():
            s = env.state_id()
            S.append(s)
            available_actions = env.available_actions_ids()

            if s not in pi:
                pi[s] = {}
                q[s] = {}
                returns[s] = {}
                for a in available_actions:
                    pi[s][a] = 1.0 / len(available_actions)
                    q[s][a] = 0.0
                    returns[s][a] = []
            chosen_action = available_actions[np.random.randint(len(available_actions))]
            A.append(chosen_action)

            old_score = env.score()
            env.act_with_action_id(chosen_action)
            r = env.score() - old_score
            R.append(r)

            G = 0
            for t in reversed(range(len(S))):
                G = gamma * G + R[t]

                found = False
                for prev_s, prev_a in zip(S[:t], A[:t]):
                    if prev_s == S[t] and prev_a == A[t]:
                        found = True
                        break
                if found:
                    continue

                if A[t] not in returns[S[t]]:
                    returns[S[t]][A[t]] = []

                returns[S[t]][A[t]].append(G)
                q[S[t]][A[t]] = np.mean(returns[S[t]][A[t]])
                pi[S[t]] = list(q[S[t]].keys())[np.argmax(list(q[S[t]].values()))]

                #max = max_dict(q[s])
                #pi[s][max[0]] = max[1]

                #optimal_a_t = list(q[S[t]].keys())[np.argmax(list(q[S[t]].values()))]
                # pi[S[t]][optimal_a_t] = np.argmax(q[S[t]][optimal_a_t])
                #for a_key in pi[S[t]].keys():
                    # pi[S[t]][a_key] = np.argmax(q[S[t]][a_key])
                    # pi[S[t]][a_key] = np.argmax(q[S[t]][optimal_a_t])

    #for s in pi.keys():
    #    probabilities = np.array(list(pi[s].values()))
    #    probabilities /= probabilities.sum()
    #    for i in range(len(probabilities)):
    #        pi[s][i] = probabilities[i]

    return PolicyAndActionValueFunction(pi, q)


## Monte Carlo On Policy

In [13]:
def algo_on_policy_monte_carlo(env) -> PolicyAndActionValueFunction:
    epsilon = 0.1
    max_episodes_count = 10000
    gamma = 0.9

    pi = {}
    q = {}
    returns = {}

    for it in tqdm(range(max_episodes_count)):
        env.reset()
        S = []
        A = []
        R = []
        while not env.is_game_over():
            s = env.state_id()
            S.append(s)
            available_actions = env.available_actions_ids()

            if s not in pi:
                pi[s] = {}
                q[s] = {}
                returns[s] = {}
                for a in available_actions:
                    pi[s][a] = 1.0 / len(available_actions)
                    q[s][a] = 0.0
                    returns[s][a] = []

            chosen_action = np.random.choice(
                list(pi[s].keys()),
                1,
                False,
                p=list(pi[s].values())
            )[0]
            A.append(chosen_action)
            old_score = env.score()
            env.act_with_action_id(chosen_action)
            r = env.score() - old_score
            R.append(r)

            G = 0

            for t in reversed(range(len(S))):
                G = gamma * G + R[t]
                s_t = S[t]
                a_t = A[t]
                found = False
                for p_s, p_a in zip(S[:t], A[:t]):
                    if s_t == p_s and a_t == p_a:
                        found = True
                        break
                if found:
                    continue

                if a_t not in returns[s_t]:
                    returns[s_t][a_t] = []

                returns[s_t][a_t].append(G)
                q[s_t][a_t] = np.mean(returns[s_t][a_t])
                optimal_a_t = list(q[s_t].keys())[np.argmax(list(q[s_t].values()))]
                available_actions_t_count = len(q[s_t])
                for a_key, q_s_a in q[s_t].items():
                    if a_key == optimal_a_t:
                        pi[s_t][a_key] = 1 - epsilon + epsilon / available_actions_t_count
                    else:
                        pi[s_t][a_key] = epsilon / available_actions_t_count

    return PolicyAndActionValueFunction(pi, q)



## Monte Carlo Off Policy

In [14]:
def algo_off_policy_monte_carlo(env) -> PolicyAndActionValueFunction:
    max_episodes_count = 10000
    gamma = 0.90

    Q = {}
    C = {}
    pi = {}

    for it in tqdm(range(max_episodes_count)):
        env.reset()
        S = []
        A = []
        R = []
        while not env.is_game_over():
            s = env.state_id()
            S.append(s)
            available_actions = env.available_actions_ids()

            if s not in pi:
                pi[s] = {}
                Q[s] = {}
                C[s] = {}
                for a in available_actions:
                    pi[s][a] = 1.0 / len(available_actions)
                    Q[s][a] = 0.0
                    C[s][a] = 0.0

            chosen_action = available_actions[np.random.randint(len(available_actions))]

            A.append(chosen_action)
            old_score = env.score()
            env.act_with_action_id(chosen_action)
            r = env.score() - old_score
            R.append(r)

            G = 0
            W = 1

            for t in reversed(range(len(S))):
                G = gamma * G + R[t]

                s_t = S[t]
                a_t = A[t]

                if a_t not in C[s_t]:
                    C[s_t][a_t] = 0.0

                if a_t not in Q[s_t]:
                    Q[s_t][a_t] = 0.0

                C[s_t][a_t] += W
                Q[s_t][a_t] += (W / (C[s_t][a_t])) * (G - Q[s_t][a_t])

                max = max_dict(Q[s])
                pi[s][max[0]] = max[1]
                # for a_key in pi[s_t].keys():
                #    pi[s_t][a_key] = np.argmax(Q[s_t][a_key])

                optimal_a_t = list(Q[s_t].keys())[np.argmax(list(Q[s_t].values()))]
                if chosen_action != optimal_a_t:
                    break

                W *= 1. / (available_actions[np.random.randint(len(available_actions))] + 1)

    for s in pi.keys():
        probabilities = np.array(list(pi[s].values()))
        probabilities /= probabilities.sum()
        for i in range(len(probabilities)):
            pi[s][i] = probabilities[i]

    return PolicyAndActionValueFunction(pi, Q)


# Résultats

## Result Monte Carlo ES

<img src="img/monte_carlo_es.png"> </img>

## Monte Carlo On Policy

<img src="img/on_policy_monte_carlo.png"> </img>

## Monte Carlo Off Policy

<img src="img/off_policy_monte_carlo.png"> </img>

# Secret Envs

## Monte Carlo ES

In [20]:
PolicyAndActionValueFunction(pi={0: 1, 54: 0, 93: 2, 114: 0, 9: 2, 63: 2, 163: 0, 39: 1, 123: 1, 109: 1, 118: 1, 19: 2, 58: 2, 153: 1, 133: 0, 128: 0}, q={0: {0: 0.1425663223383026, 1: 0.1461294385796545, 2: 0.14421583087512288}, 54: {0: 0.3445226917057903, 2: 0.2059401309635173}, 93: {2: 1.0}, 114: {0: 0.34268292682926826, 1: 0.2028873917228104}, 9: {1: 0.22093862815884477, 2: 0.33999999999999997}, 63: {2: 1.0}, 163: {0: 1.0}, 39: {1: 0.3344262295081967, 2: 0.22190265486725663}, 123: {1: 1.0}, 109: {0: 0.2084628670120898, 1: 0.3394988946204864}, 118: {1: 1.0}, 19: {0: 0.2125, 2: 0.33597773138482945}, 58: {2: 1.0}, 153: {1: 1.0}, 133: {0: 1.0}, 128: {0: 1.0}})

PolicyAndActionValueFunction(pi={0: {0: 0.32701129958531144, 1: 0.354282094842198, 2: 0.3187066055724906}, 9: {1: 1.0, 2: 1.4136344437549157e-10, 0: 0.0}, 109: {0: 0.42769230770194316, 1: 0.5723076922980568}, 128: {0: 1.0}, 39: {1: 0.9999999997225122, 2: 0.5, 0: 2.774878501529102e-10}, 153: {1: 1.0, 0: 1.0}, 19: {0: 0.3687418371943629, 2: 1.4349184880713655e-10, 1: 0.6312581628056371}, 54: {0: 0.6688128772460566, 2: 6.984919307447685e-11, 1: 0.3311871227539433}, 114: {0: 2.7855420050914686e-10, 1: 0.9999999997214457}, 63: {2: 1.0, 0: 1.0}, 118: {1: 1.0, 0: 1.0}, 93: {2: 1.0, 0: 1.0}, 133: {0: 1.0}, 163: {0: 1.0}, 58: {2: 1.0, 0: 1.0}, 123: {1: 1.0, 0: 1.0}}, q={0: {0: 1.189118460482065e-20, 1: 1.1445929154840554e-20, 2: 1.145986914951939e-20}, 9: {1: 7.273131997294813e-11, 2: 1.4161320311109802e-10}, 109: {0: 7.018826682723907e-11, 1: 1.4020089831838606e-10}, 128: {0: 1.0}, 39: {1: 1.3874392511495485e-10, 2: 6.933559606661577e-11}, 153: {1: 1.0}, 19: {0: 7.310936544641636e-11, 2: 1.437

## Monte Carlo On Policy

In [None]:
PolicyAndActionValueFunction(pi={0: {0: 0.03333333333333333, 1: 0.9333333333333333, 2: 0.03333333333333333}, 109: {0: 0.05, 1: 0.9500000000000001}, 19: {0: 0.05, 2: 0.9500000000000001}, 128: {0: 1.0}, 58: {2: 1.0}, 39: {1: 0.9500000000000001, 2: 0.05}, 54: {0: 0.9500000000000001, 2: 0.05}, 163: {0: 1.0}, 153: {1: 1.0}, 114: {0: 0.9500000000000001, 1: 0.05}, 133: {0: 1.0}, 9: {1: 0.05, 2: 0.9500000000000001}, 63: {2: 1.0}, 123: {1: 1.0}, 118: {1: 1.0}, 93: {2: 1.0}}, q={0: {0: 0.19433781190019195, 1: 0.200304833319749, 2: 0.1956320657759507}, 109: {0: 0.23333333333333336, 1: 0.35932835820895526}, 19: {0: 0.225, 2: 0.36899999999999994}, 128: {0: 1.0}, 58: {2: 1.0}, 39: {1: 0.3642140468227425, 2: 0.15}, 54: {0: 0.35106382978723405, 2: 0.16363636363636364}, 163: {0: 1.0}, 153: {1: 1.0}, 114: {0: 0.3602469135802469, 1: 0.2189910979228487}, 133: {0: 1.0}, 9: {1: 0.231864406779661, 2: 0.3648286140089419}, 63: {2: 1.0}, 123: {1: 1.0}, 118: {1: 1.0}, 93: {2: 1.0}})

## Monte Carlo Off Policy

In [None]:
PolicyAndActionValueFunction(pi={0: {0: 0.32701129958531144, 1: 0.354282094842198, 2: 0.3187066055724906}, 9: {1: 1.0, 2: 1.4136344437549157e-10, 0: 0.0}, 109: {0: 0.42769230770194316, 1: 0.5723076922980568}, 128: {0: 1.0}, 39: {1: 0.9999999997225122, 2: 0.5, 0: 2.774878501529102e-10}, 153: {1: 1.0, 0: 1.0}, 19: {0: 0.3687418371943629, 2: 1.4349184880713655e-10, 1: 0.6312581628056371}, 54: {0: 0.6688128772460566, 2: 6.984919307447685e-11, 1: 0.3311871227539433}, 114: {0: 2.7855420050914686e-10, 1: 0.9999999997214457}, 63: {2: 1.0, 0: 1.0}, 118: {1: 1.0, 0: 1.0}, 93: {2: 1.0, 0: 1.0}, 133: {0: 1.0}, 163: {0: 1.0}, 58: {2: 1.0, 0: 1.0}, 123: {1: 1.0, 0: 1.0}}, q={0: {0: 1.189118460482065e-20, 1: 1.1445929154840554e-20, 2: 1.145986914951939e-20}, 9: {1: 7.273131997294813e-11, 2: 1.4161320311109802e-10}, 109: {0: 7.018826682723907e-11, 1: 1.4020089831838606e-10}, 128: {0: 1.0}, 39: {1: 1.3874392511495485e-10, 2: 6.933559606661577e-11}, 153: {1: 1.0}, 19: {0: 7.310936544641636e-11, 2: 1.4373924854641968e-10}, 54: {0: 1.4105632913802645e-10, 2: 7.581496045400076e-11}, 114: {0: 1.3927710029336967e-10, 1: 7.245125847401517e-11}, 63: {2: 1.0}, 118: {1: 1.0}, 93: {2: 1.0}, 133: {0: 1.0}, 163: {0: 1.0}, 58: {2: 1.0}, 123: {1: 1.0}})

# Temporal difference

In [21]:
def get_epsilon_best_action(epsilon, available_actions, Q, s):
    available_actions_len = len(available_actions)
    if available_actions_len == 1:
        return available_actions[0]
    elif available_actions_len == 0:
        action_values = list(Q[s].values())
        if len(action_values) > 0:
            best_action_value = np.sort(action_values)[len(action_values)-1]
            best_action = list(Q[s].keys())[list(Q[s].values()).index(best_action_value)]
            return best_action
        else:
            return np.random.randint(8)

    if np.random.uniform(0, 1) > epsilon:
        return available_actions[np.random.randint(available_actions_len)]
    else:
        for i in range(len(list(Q[s].keys())) - 1, 0, -1):
            best_action_value = np.sort(list(Q[s].values()))[i]
            best_action = list(Q[s].keys())[list(Q[s].values()).index(best_action_value)]
            if best_action in available_actions:
                return best_action
        return available_actions[np.random.randint(available_actions_len)]



## Q-Learning

In [22]:

def algo_q_learning(env) -> PolicyAndActionValueFunction:
    alpha = 0.1
    epsilon = 1.0
    gamma = 0.9
    max_iter = 10000

    pi = {}  # learned greedy policy
    b = {}  # behaviour epsilon-greedy policy
    q = {}  # action-value function of pi

    for it in tqdm(range(max_iter)):
        env.reset()

        while not env.is_game_over():
            s = env.state_id()
            available_actions = env.available_actions_ids()
            if s not in pi:
                pi[s] = {}
                q[s] = {}
                b[s] = {}
                for a in available_actions:
                    pi[s][a] = 1.0 / len(available_actions)
                    q[s][a] = 0.0
                    b[s][a] = 1.0 / len(available_actions)

            # actions disponibles differents selon les states
            available_actions_count = len(available_actions)
            optimal_a = list(q[s].keys())[np.argmax(list(q[s].values()))]
            for a_key, q_s_a in q[s].items():
                if a_key == optimal_a:
                    b[s][a_key] = 1 - epsilon + epsilon / available_actions_count
                else:
                    b[s][a_key] = epsilon / available_actions_count

            chosen_action = np.random.choice(
                list(b[s].keys()),
                1,
                False,
                p=list(b[s].values())
            )[0]
            old_score = env.score()
            env.act_with_action_id(chosen_action)
            r = env.score() - old_score
            s_p = env.state_id()
            next_available_actions = env.available_actions_ids()

            if env.is_game_over():
                q[s][chosen_action] += alpha * (r + 0.0 - q[s][chosen_action])
            else:
                if s_p not in pi:
                    pi[s_p] = {}
                    q[s_p] = {}
                    b[s_p] = {}
                    for a in next_available_actions:
                        pi[s_p][a] = 1.0 / len(next_available_actions)
                        q[s_p][a] = 0.0
                        b[s_p][a] = 1.0 / len(next_available_actions)
                q[s][chosen_action] += alpha * (r + gamma * np.max(list(q[s_p].values())) - q[s][chosen_action])

    for s in q.keys():
        optimal_a = list(q[s].keys())[np.argmax(list(q[s].values()))]
        for a_key, q_s_a in q[s].items():
            if a_key == optimal_a:
                pi[s][a_key] = 1.0
            else:
                pi[s][a_key] = 0.0

    return PolicyAndActionValueFunction(pi, q)



## Sarsa

In [23]:
def algo_sarsa(env) -> PolicyAndActionValueFunction:
    max_episodes_count = 10000
    alpha = 0.85
    gamma = 0.95
    epsilon = 0.1

    Q = {}
    pi = {}

    for ep in tqdm(range(max_episodes_count)):

        env.reset()
        S = []
        A = []
        R = []

        s_1 = env.state_id()
        available_actions = env.available_actions_ids()
        if s_1 not in Q:
            pi[s_1] = {}
            Q[s_1] = {}
            for a in available_actions:
                pi[s_1][a] = 1.0 / len(available_actions)
                Q[s_1][a] = 0.0
        action_1 = get_epsilon_best_action(epsilon, available_actions, Q, s_1)

        while not env.is_game_over():
            S.append(s_1)
            available_actions = env.available_actions_ids()

            if s_1 not in Q:
                pi[s_1] = {}
                Q[s_1] = {}
                for a in available_actions:
                    pi[s_1][a] = 1.0 / len(available_actions)
                    Q[s_1][a] = 0.0

            A.append(action_1)

            old_score = env.score()
            env.act_with_action_id(action_1)
            r = env.score() - old_score
            R.append(r)

            s_2 = env.state_id()
            available_actions = env.available_actions_ids()

            if s_2 not in Q:
                Q[s_2] = {}
                pi[s_2] = {}
                for a in available_actions:
                    Q[s_2][a] = 0.0
                    pi[s_2][a] = 1.0 / len(available_actions)

            action_2 = get_epsilon_best_action(epsilon, available_actions, Q, s_2)

            if action_2 not in Q[s_2]:
                Q[s_2][action_2] = 0.0

            target = r + gamma * Q[s_2][action_2]
            Q[s_1][action_1] += alpha * (target - Q[s_1][action_1])

            #for a_key in pi[s_1].keys():
            #    max = np.argmax(Q[s_1][a_key])
            #    pi[s_1][a_key] = max
            s_1 = s_2
            action_1 = action_2

    for s in Q.keys():
        max = max_dict(Q[s])
        pi[s][max[0]] = max[1]
        probabilities = np.array(list(pi[s].values()))
        probabilities /= probabilities.sum()
        for i in range(len(probabilities)):
            pi[s][i] = probabilities[i]

    return PolicyAndActionValueFunction(pi, Q)


## Expected Sarsa

In [24]:
def algo_expected_sarsa(env) -> PolicyAndActionValueFunction:
    alpha = 0.1
    epsilon = 1.0
    gamma = 0.9
    max_iter = 10000

    pi = {}  # learned greedy policy
    b = {}  # behaviour epsilon-greedy policy
    q = {}  # action-value function of pi

    for it in tqdm(range(max_iter)):
        env.reset()

        while not env.is_game_over():
            s = env.state_id()
            available_actions = env.available_actions_ids()
            if s not in pi:
                pi[s] = {}
                q[s] = {}
                b[s] = {}
                for a in available_actions:
                    pi[s][a] = 1.0 / len(available_actions)
                    q[s][a] = 0.0
                    b[s][a] = 1.0 / len(available_actions)

            # actions disponibles differents selon les states
            available_actions_count = len(available_actions)
            optimal_a = list(q[s].keys())[np.argmax(list(q[s].values()))]
            for a_key, q_s_a in q[s].items():
                if a_key == optimal_a:
                    b[s][a_key] = 1 - epsilon + epsilon / available_actions_count
                else:
                    b[s][a_key] = epsilon / available_actions_count

            chosen_action = np.random.choice(
                list(b[s].keys()),
                1,
                False,
                p=list(b[s].values())
            )[0]
            old_score = env.score()
            env.act_with_action_id(chosen_action)
            r = env.score() - old_score
            s_p = env.state_id()
            next_available_actions = env.available_actions_ids()

            if env.is_game_over():
                q[s][chosen_action] += alpha * (r + 0.0 - q[s][chosen_action])
            else:
                if s_p not in pi:
                    pi[s_p] = {}
                    q[s_p] = {}
                    b[s_p] = {}
                    for a in next_available_actions:
                        pi[s_p][a] = 1.0 / len(next_available_actions)
                        q[s_p][a] = 0.0
                        b[s_p][a] = 1.0 / len(next_available_actions)
                sum = 0
                for a in pi[s_p]:
                    sum += pi[s_p][a] * q[s_p][a]
                q[s][chosen_action] += alpha * (r + gamma * sum - q[s][chosen_action])

    for s in q.keys():
        optimal_a = list(q[s].keys())[np.argmax(list(q[s].values()))]
        for a_key, q_s_a in q[s].items():
            if a_key == optimal_a:
                pi[s][a_key] = 1.0
            else:
                pi[s][a_key] = 0.0

    return PolicyAndActionValueFunction(pi, q)



# Result for Q Learning

<img src="img/q_learning.png"> </img>

# Result for Sarsa

<img src="img/sarsa.png"> </img>

# Result for expected Sarsa

<img src="img/ex_sarsa.png"> </img>

# Deep Learning Environments

## TicTacToe Environment

In [11]:
def init_tic_tac_toe_dict():
    dict = {}
    all_possible_states = 9
    for s in range(all_possible_states):
        dict[s] = {}
        for a in range(all_possible_states):
            dict[s][a] = 0
    return dict


class TicTacToe(DeepSingleAgentWithDiscreteActionsEnv):
    def __init__(self):
        self.cases = [-1] * 9
        self.game_state = 0
        self.game_over = False
        self.player_turn = True
        self.player_value = 1
        self.random_player_value = 0
        self.current_score = 0.0
        self.reset()

    def state_id(self) -> int:
        sum = 0
        available_actions_size = 2
        for i in range(len(self.cases)):
            case = self.cases[i]
            if case == self.player_value:
                sum += pow(available_actions_size, i)
            elif case == self.random_player_value:
                sum += pow(available_actions_size, len(self.cases) + i)
        return sum

    def state_description(self) -> int:
        return self.cases

    def state_description_length(self) -> int:
        return 9

    def max_actions_count(self) -> int:
        return 9

    def is_game_over(self) -> bool:
        return self.game_over

    def act_with_action_id(self, action_id: int):
        if self.cases[action_id] != -1:
            print(self.cases)
            print(action_id)
            print(self.available_actions_ids())
        assert (action_id < len(self.cases))
        assert (self.cases[action_id] == -1)
        assert (not self.game_over)

        if self.player_turn:
            self.cases[action_id] = self.player_value
        else:
            self.cases[action_id] = self.random_player_value

        self.player_turn = not self.player_turn
        self.game_state = self.state_id()

        if self.tictactoe_ended(self.player_value):
            self.game_over = True
            self.current_score = 1.0
        elif self.tictactoe_ended(self.random_player_value):
            self.game_over = True
            self.current_score = -1.0
        elif -1 not in self.cases:
            self.game_over = True
            self.current_score = 0.0
        elif not self.player_turn:
            rand = random.randint(0, 8)
            while self.cases[rand] != -1:
                rand = random.randint(0, 8)
            self.act_with_action_id(rand)

    def score(self) -> float:
        return self.current_score

    def available_actions_ids(self) -> np.ndarray:
        if self.game_over:
            return np.array([], dtype=np.int)
        available_actions = []
        for i in range(len(self.cases)):
            if self.cases[i] == -1:
                available_actions.append(i)
        return np.array(available_actions)

    def reset(self):
        self.game_over = False
        self.current_score = 0.0
        self.game_state = 0
        self.player_turn = True
        self.cases = [-1] * 9

    def line_checked(self, cases) -> bool:
        return (0 in cases and 1 in cases and 2 in cases) or \
               (3 in cases and 4 in cases and 5 in cases) or \
               (6 in cases and 7 in cases and 8 in cases)

    def column_checked(self, cases) -> bool:
        return (0 in cases and 3 in cases and 6 in cases) or \
               (1 in cases and 4 in cases and 7 in cases) or \
               (2 in cases and 5 in cases and 8 in cases)

    def diagonal_checked(self, cases) -> bool:
        return (0 in cases and 4 in cases and 8 in cases) or \
               (2 in cases and 4 in cases and 6 in cases)

    def tictactoe_ended(self, player_indice) -> bool:
        player_indice_cases = []
        for i, case in enumerate(self.cases):
            if case == player_indice:
                player_indice_cases.append(i)

        if self.line_checked(player_indice_cases) or self.column_checked(player_indice_cases) or self.diagonal_checked(player_indice_cases):
            return True
        return False


## Pacman Environment

In [None]:
def add_wall(cases, start_x, start_y, x, y):
    for i in range(start_x, start_x + x):
        for j in range(start_y, start_y + y):
            cases[i][j] = -1
    return cases


# -1 mur / 0 vide / 1 dot / 2 mega dot
def initiate_map():
    cases = []
    for line in range(29):
        cases.append([1] * 26)

    cases[2][0] = 2
    cases[22][0] = 2
    cases[2][25] = 2
    cases[22][25] = 2

    for i in range(4):
        cases[i][12] = -1
        cases[i][13] = -1

    for i in range(8, 19):
        for j in range(6, 20):
            cases[i][j] = 0

    for i in range(0, 5):
        cases[13][i] = 0
    for i in range(21, 26):
        cases[13][i] = 0

    cases = add_wall(cases, 1, 1, 3, 4)
    cases = add_wall(cases, 1, 21, 3, 4)
    cases = add_wall(cases, 1, 6, 3, 5)
    cases = add_wall(cases, 1, 15, 3, 5)
    cases = add_wall(cases, 5, 1, 2, 4)
    cases = add_wall(cases, 5, 21, 2, 4)
    cases = add_wall(cases, 11, 9, 5, 8)

    cases = add_wall(cases, 8, 0, 5, 5)
    cases = add_wall(cases, 14, 0, 5, 5)
    cases = add_wall(cases, 8, 21, 5, 5)
    cases = add_wall(cases, 14, 21, 5, 5)

    cases = add_wall(cases, 5, 6, 8, 2)
    cases = add_wall(cases, 8, 7, 2, 4)
    cases = add_wall(cases, 5, 18, 8, 2)
    cases = add_wall(cases, 8, 15, 2, 4)

    cases = add_wall(cases, 14, 6, 5, 2)
    cases = add_wall(cases, 14, 18, 5, 2)

    cases = add_wall(cases, 5, 9, 2, 8)
    cases = add_wall(cases, 7, 12, 3, 2)
    cases = add_wall(cases, 17, 9, 2, 8)
    cases = add_wall(cases, 19, 12, 3, 2)
    cases = add_wall(cases, 23, 9, 2, 8)
    cases = add_wall(cases, 25, 12, 3, 2)

    cases = add_wall(cases, 20, 15, 2, 5)
    cases = add_wall(cases, 20, 6, 2, 5)

    cases = add_wall(cases, 20, 1, 2, 4)
    cases = add_wall(cases, 22, 3, 3, 2)
    cases = add_wall(cases, 20, 21, 2, 4)
    cases = add_wall(cases, 22, 21, 3, 2)

    cases = add_wall(cases, 23, 0, 2, 2)
    cases = add_wall(cases, 23, 24, 2, 2)

    cases = add_wall(cases, 26, 1, 2, 10)
    cases = add_wall(cases, 23, 6, 3, 2)
    cases = add_wall(cases, 26, 15, 2, 10)
    cases = add_wall(cases, 23, 18, 3, 2)

    return cases

In [None]:
class PacMan(DeepSingleAgentWithDiscreteActionsEnv):
    def __init__(self):
        pygame.init()
        self.cases = initiate_map()
        self.game_over = False
        self.current_score = 0.0
        self.round_counter = 0
        self.move_time = TimeCapsule(0.3)
        self.pacman_position = {'x': 13, 'y': 22}
        self.ghost_spawn_position = [
            [13, 10],
            [8, 14],
            [17, 14],
            [17, 10]
        ]
        self.ghosts = [
            {'x': self.ghost_spawn_position[i][0], 'y': self.ghost_spawn_position[i][1], 'dead': False, 'time_to_respawn': TimeCapsule(3)} for i in range(4)
        ]
        self.take_energizer = False
        self.energizer_time = TimeCapsule(5.0)
        self.reset()

    def state_description(self) -> np.ndarray:
        complete_cases = self.get_complete_cases()
        return np.hstack(complete_cases)

    def get_complete_cases(self):
        cases_copy = np.array(self.cases)
        for i in range(4):
            cases_copy[self.ghosts[i]['y'], self.ghosts[i]['x']] = i + 4
        cases_copy[self.pacman_position['y']][self.pacman_position['x']] = 3
        return cases_copy

    def state_description_length(self) -> int:
        return 29 * 26

    def max_actions_count(self) -> int:
        return 4

    def is_game_over(self) -> bool:
        return self.game_over

    def act_with_action_id(self, action_id: int):
        assert (3 >= action_id >= 0)
        assert (not self.game_over)

        if self.take_energizer and self.energizer_time.can_execute():
            self.take_energizer = False

        x = self.pacman_position['x']
        y = self.pacman_position['y']

        if action_id == 0:
            x -= 1
        elif action_id == 1:
            x += 1
        elif action_id == 2:
            y -= 1
        elif action_id == 3:
            y += 1

        self.move_ghosts()
        if 0 <= x < 26 and 0 <= y < 29:
            if self.cases[y][x] != -1:
                self.pacman_position['x'] = x
                self.pacman_position['y'] = y

                case_value = self.cases[y][x]
                if case_value == 1:
                    self.current_score += 100
                    self.cases[y][x] = 0
                elif case_value == 2:
                    self.cases[y][x] = 0
                    self.take_energizer = True
                    self.energizer_time.restart()

        if self.check_game_ended():
            self.current_score += 10000
            self.game_over = True

        #self.current_score -= 5
        self.round_counter += 1

    def check_game_ended(self):
        dot_counter = 0
        for i in range(len(self.cases)):
            for j in range(len(self.cases[i])):
                if self.cases[i][j] == 1:
                    dot_counter += 1

        if self.round_counter % 10 == 0:
            print(f"Dots left : {dot_counter}")
            print(f"Score : {self.current_score}")
        return dot_counter == 0

    def move_ghosts(self):

        if self.game_over:
            return

        for i in range(4):

            if self.ghosts[i]['dead']:
                if self.ghosts[i]['time_to_respawn'].can_execute():
                    self.ghosts[i]['dead'] = False
                    self.ghosts[i]['x'] = 13
                    self.ghosts[i]['y'] = 10
                continue

            ghost_available_action = self.get_available_actions(i)
            random_action = ghost_available_action[np.random.randint(len(ghost_available_action))]

            new_x = self.ghosts[i]['x']
            new_y = self.ghosts[i]['y']

            if random_action == 0:
                new_x -= 1
            elif random_action == 1:
                new_x += 1
            elif random_action == 2:
                new_y -= 1
            elif random_action == 3:
                new_y += 1

            if 0 <= new_x < 26 and 0 <= new_y < 29:
                if self.pacman_position['x'] == new_x and self.pacman_position['y'] == new_y:
                    if self.take_energizer:
                        self.current_score += 10000
                        self.ghosts[i] = {'x': 13, 'y': 4, 'dead': True, 'time_to_respawn': TimeCapsule(3)}
                    else:
                        self.current_score -= 0 # 100000
                        self.game_over = True
                        return
                elif self.cases[new_y][new_x] != -1:
                    can_swap_places = True
                    for j in range(4):
                        if i != j:
                            can_swap_places = self.ghosts[j]['x'] != new_x or self.ghosts[j]['y'] != new_y
                            if not can_swap_places:
                                continue

                    if can_swap_places:
                        self.ghosts[i]['x'] = new_x
                        self.ghosts[i]['y'] = new_y

    def score(self) -> float:
        return self.current_score

    def get_available_actions(self, ghost_id):
        if self.game_over:
            return np.array([], dtype=np.int)

        available_actions = []
        if ghost_id is not None:
            pos_x, pos_y = self.ghosts[ghost_id]['x'], self.ghosts[ghost_id]['y']
        else:
            pos_x, pos_y = self.pacman_position['x'], self.pacman_position['y']

        if 0 <= (pos_x - 1) < 26 and self.cases[pos_y][pos_x - 1] != -1:
            available_actions.append(0)
        if 0 <= (pos_x + 1) < 26 and self.cases[pos_y][pos_x + 1] != -1:
            available_actions.append(1)
        if 0 <= pos_y - 1 < 29 and self.cases[pos_y - 1][pos_x] != -1:
            available_actions.append(2)
        if 0 <= pos_y + 1 < 29 and self.cases[pos_y + 1][pos_x] != -1:
            available_actions.append(3)

        return np.array(available_actions, dtype=np.int)

    def available_actions_ids(self) -> np.ndarray:
        return self.get_available_actions(None)

    def reset(self):
        pygame.init()
        self.cases = initiate_map()
        self.game_state = 0
        self.game_over = False
        self.current_score = 0.0
        self.move_time = TimeCapsule(0.3)
        self.pacman_position = {'x': 13, 'y': 22}
        self.ghost_spawn_position = [
            [13, 10],
            [8, 14],
            [17, 14],
            [17, 10]
        ]
        self.ghosts = [
            {'x': self.ghost_spawn_position[i][0], 'y': self.ghost_spawn_position[i][1], 'dead': False,
             'time_to_respawn': TimeCapsule(3)} for i in range(4)
        ]
        self.take_energizer = False


# Deep Reinforcement Learning

## Semi Gradient Sarsa

In [9]:
def episodic_semi_gradient_sarsa(env: DeepSingleAgentWithDiscreteActionsEnv):
    epsilon = 0.1
    gamma = 0.9
    max_episodes_count = 100 if not isinstance(env, PacMan) else 10
    pre_warm = (max_episodes_count / 10) if not isinstance(env, PacMan) else 3

    state_description_length = env.state_description_length()
    max_actions_count = env.max_actions_count()

    q = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation=tf.keras.activations.tanh,
                              input_dim=(state_description_length + max_actions_count)),
        tf.keras.layers.Dense(1, activation=tf.keras.activations.linear),
    ])

    q.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.mse)

    for episode_id in tqdm.tqdm(range(max_episodes_count)):
        env.reset()
        round_counter = 0

        while not env.is_game_over():
            round_counter += 1
            s = env.state_description()
            available_actions = env.available_actions_ids()

            if (episode_id < pre_warm) or np.random.uniform(0.0, 1.0) < epsilon:
                chosen_action = np.random.choice(available_actions)
            else:
                all_q_inputs = np.zeros((len(available_actions), state_description_length + max_actions_count))
                for i, a in enumerate(available_actions):
                    all_q_inputs[i] = np.hstack([s, tf.keras.utils.to_categorical(a, max_actions_count)])

                all_q_values = np.squeeze(q.predict(all_q_inputs))
                chosen_action = available_actions[np.argmax(all_q_values)]

            previous_score = env.score()
            env.act_with_action_id(chosen_action)
            r = env.score() - previous_score
            s_p = env.state_description()

            if env.is_game_over():
                target = r
                q_inputs = np.hstack([s, tf.keras.utils.to_categorical(chosen_action, max_actions_count)])
                q.train_on_batch(np.array([q_inputs]), np.array([target]))
                break

            next_available_actions = env.available_actions_ids()

            if episode_id < pre_warm or np.random.uniform(0.0, 1.0) < epsilon:
                next_chosen_action = np.random.choice(next_available_actions)
            else:
                next_chosen_action = None
                next_chosen_action_q_value = None
                for a in next_available_actions:
                    q_inputs = np.hstack([s_p, tf.keras.utils.to_categorical(a, max_actions_count)])
                    q_value = q.predict(np.array([q_inputs]))[0][0]
                    if next_chosen_action is None or next_chosen_action_q_value < q_value:
                        next_chosen_action = a
                        next_chosen_action_q_value = q_value

            next_q_inputs = np.hstack([s_p, tf.keras.utils.to_categorical(next_chosen_action, max_actions_count)])
            next_chosen_action_q_value = q.predict(np.array([next_q_inputs]))[0][0]

            target = r + gamma * next_chosen_action_q_value

            q_inputs = np.hstack([s, tf.keras.utils.to_categorical(chosen_action, max_actions_count)])
            q.train_on_batch(np.array([q_inputs]), np.array([target]))

    return q


## Deep Q Learning

In [10]:
def deep_q_learning(env: DeepSingleAgentWithDiscreteActionsEnv):
    epsilon = 0.1
    gamma = 0.95
    eps_decay = 0.995
    eps_min = 0.01
    max_episodes_count = 100

    state_description_length = env.state_description_length()
    max_actions_count = env.max_actions_count()

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation=tf.keras.activations.relu,
                              input_dim=(state_description_length + max_actions_count)),
        tf.keras.layers.Dense(16, activation=tf.keras.activations.relu),
        tf.keras.layers.Dense(1, activation=tf.keras.activations.linear),
    ])
    target_model = model

    model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.mse)
    target_model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.mse)

    target_model.set_weights(model.get_weights())

    buffer = ReplayBuffer()

    for episode_id in tqdm.tqdm(range(max_episodes_count)):
        done, total_reward = False, 0
        s = env.state_description()
        env.reset()

        while not env.is_game_over():
            available_actions = env.available_actions_ids()

            # get model action
            epsilon *= eps_decay
            epsilon = max(epsilon, eps_min)
            if np.random.random() < epsilon:
                action = np.random.choice(available_actions)
            else:
                all_q_inputs = get_q_inputs(available_actions, s, state_description_length, max_actions_count)
                all_q_values = np.squeeze(model.predict(all_q_inputs))
                action = available_actions[np.argmax(all_q_values)]

            # step env
            previous_score = env.score()
            env.act_with_action_id(action)
            r = env.score() - previous_score
            next_available_action = env.available_actions_ids()

            next_s = env.state_description()
            buffer.put(s, action, available_actions, r * 0.01, next_s, next_available_action, env.is_game_over())
            total_reward += r
            s = next_s

            if buffer.size() >= batch_size:
                for _ in range(10):
                    states, actions, available_actions, rewards, next_states, next_available_actions, done = buffer.sample()
                    targets = []
                    next_q_values = []
                    for x in range(len(states)):
                        all_q_inputs = get_q_inputs(available_actions[x], states[x], state_description_length,
                                                       max_actions_count)
                        targets.append(target_model.predict(all_q_inputs))

                        next_all_q_inputs = get_q_inputs(next_available_actions[x], next_states[x], state_description_length,
                                                        max_actions_count)
                        if len(next_all_q_inputs) == 0:
                            next_q_values.append([0])
                        else:
                            next_q_values.append(target_model.predict(next_all_q_inputs).max(axis=1))

                    targets[range(batch_size), actions] = rewards + (1 - done) * next_q_values * gamma
                    model.fit(states, targets, epochs=1, verbose=0)

            target_model.set_weights(model.get_weights())

            print(f"Episode[{episode_id}] => Reward : {total_reward}")
    return model
