In [None]:
# imports
import random
import numpy as np

In [1]:
# a class for the space
class Space:
    """
    Grid world in which the player moves.

    The space is a 2D list of single-character cells:
        'S' : the starting point
        '_' : empty space
        'J' : the goal
        'D' : a dragon

    ``Space()`` builds the default 4 * 4 space:
        S___
        D_D_
        ___D
        _D_J

    ``Space(lines, columns, dragons)`` builds a ``lines`` * ``columns`` space
    with 'S' in the top-left corner, 'J' in the bottom-right corner and
    ``dragons`` dragons placed on randomly chosen empty cells.
    """

    # Layout used when no dimensions are given.
    DEFAULT_SPACE = (
        "S___",
        "D_D_",
        "___D",
        "_D_J",
    )

    def __init__(self, lines=None, columns=None, dragons=0):
        """
        Build the default space or a random one.

        Python keeps only the last ``__init__`` written in a class body, so
        both construction modes live in this single constructor.

        Args:
            lines: number of rows; omit together with ``columns`` to get the
                default space.
            columns: number of columns.
            dragons: number of dragons to place (random mode only).

        Raises:
            ValueError: if only one of ``lines``/``columns`` is given, if a
                dimension is smaller than 1, or if ``dragons`` exceeds the
                number of empty cells.
        """
        if lines is None and columns is None:
            self.space = [list(row) for row in self.DEFAULT_SPACE]
            return
        if lines is None or columns is None:
            raise ValueError("lines and columns must be given together")
        if lines < 1 or columns < 1:
            raise ValueError("lines and columns must be at least 1")

        self.space = [['_'] * columns for _ in range(lines)]
        self.space[0][0] = 'S'
        self.space[lines - 1][columns - 1] = 'J'

        # Draw the dragon cells in one go from the cells that are still empty:
        # exactly `dragons` cells are used and the call cannot loop forever.
        free_cells = [
            (l, c)
            for l in range(lines)
            for c in range(columns)
            if self.space[l][c] == '_'
        ]
        dragons = max(dragons, 0)
        if dragons > len(free_cells):
            raise ValueError(
                "cannot place %d dragons on %d empty cells" % (dragons, len(free_cells))
            )
        for l, c in random.sample(free_cells, dragons):
            self.space[l][c] = 'D'

    # a method to pretty print the space
    def print_space(self):
        """Print the space, one row per output line, each cell followed by '| '."""
        for l in self.space:
            for c in l:
                print(c, end='| ')

            print()

    # a method to get the size of the lines
    def get_lines_size(self):
        """Return the number of rows."""
        return len(self.space)

    # a method to get the size of the columns
    def get_columns_size(self):
        """Return the number of columns (length of the first row)."""
        return len(self.space[0])

    # a method to get the size of the space
    def get_size(self):
        """Return the total number of cells (rows * columns)."""
        return self.get_lines_size() * self.get_columns_size()
    
# a static class for the rewards
class Rewards:
    """Static lookup table giving the reward attached to each cell character."""

    # the rewards for each character
    rewards = {
        'S': 0,
        '_': 0,
        'J': 100,
        'D': -100
    }

    # a method to get the reward of a character
    @staticmethod
    def get_reward(character):
        """
        Return the reward for ``character``.

        Declared as a static method so it works both as
        ``Rewards.get_reward(c)`` and on an instance.

        Returns:
            The reward from ``Rewards.rewards``, or ``None`` when the
            character has no entry.
        """
        return Rewards.rewards.get(character)
    
# a static class for the Directions
class Directions:
    """Static helpers around the list of moves the player can make."""

    # the directions; the position in this list is the action index in the Q matrix
    directions = ["HAUT", "DROITE", "BAS", "GAUCHE"]

    # a method to get the size of the directions
    @staticmethod
    def get_size():
        """Return the number of available directions."""
        return len(Directions.directions)

    # a method to get the index of a direction
    @staticmethod
    def get_index(direction):
        """
        Return the index of ``direction`` in ``Directions.directions``.

        Raises:
            ValueError: if ``direction`` is not a known direction.
        """
        return Directions.directions.index(direction)

    # a method to get a random direction
    @staticmethod
    def get_random_direction():
        """Return one direction chosen uniformly at random."""
        return random.choice(Directions.directions)

    # a method to get the direction that maximizes the Q value
    @staticmethod
    def get_max_direction(mat_q, state):
        """
        Return the direction with the highest Q value for ``state``.

        ``mat_q[state]`` must yield one value per direction; the first
        maximum wins when several values tie.
        """
        return Directions.directions[np.argmax(mat_q[state])]

# a class for the player
class Player:
    """Holds the player's current ``(line, column)`` position."""

    def __init__(self, position=(0, 0)):
        """
        Create a player.

        A single constructor with a default replaces the two ``__init__``
        definitions: Python keeps only the last one, which made ``Player()``
        fail with a TypeError.

        Args:
            position: ``(line, column)`` of the player; defaults to the
                starting point ``(0, 0)``.
        """
        self.position = position


# a class for the game
class Game:
    """
    Q-learning game: a player, a space and a Q matrix.

    The Q matrix has one row per cell of the space (cells numbered row by
    row) and one column per direction; it is initialized with zeros.
    """

    def __init__(self, GAMMA = 0.96, ALPHA = 0.81, episodes = 10000, steps = 100, is_random_space = False):
        """
        Args:
            GAMMA: discount factor used in the Q update (0.96 by default).
            ALPHA: learning rate used in the Q update (0.81 by default).
            episodes: number of episodes (10000 by default); stored only.
            steps: number of steps (100 by default); stored only.
            is_random_space: when True a random 4 * 4 space with 3 dragons is
                built, otherwise the default space is used.
        """
        self.GAMMA = GAMMA
        self.ALPHA = ALPHA
        self.episodes = episodes
        self.steps = steps
        # the player starts at the starting point
        self.player = Player((0, 0))

        # the space
        if is_random_space:
            self.space = Space(4, 4, 3)
        else:
            self.space = Space()

        # the Q matrix: (number of cells) x (number of directions)
        self.mat_q = np.zeros((self.space.get_size(), Directions.get_size()))

    def _state_index(self, state):
        """Convert a ``(line, column)`` position to its row index in the Q matrix.

        Values that are not tuples are assumed to already be row indices and
        are returned unchanged.
        """
        if isinstance(state, tuple):
            line, column = state
            return line * self.space.get_columns_size() + column
        return state

    # a method to update the Q matrix
    def update_q(self, action, reward, next_state):
        """
        Apply one Q-learning update for the player's current position.

        Q(s, a) += ALPHA * (reward + GAMMA * max Q(next_state, .) - Q(s, a))

        Args:
            action: name of the direction that was taken.
            reward: reward received for the move.
            next_state: state reached, as a ``(line, column)`` tuple or as a
                row index of the Q matrix.
        """
        state = self._state_index(self.player.position)
        next_index = self._state_index(next_state)
        action_index = Directions.get_index(action)

        target = reward + self.GAMMA * np.max(self.mat_q[next_index])
        self.mat_q[state][action_index] += self.ALPHA * (target - self.mat_q[state][action_index])


In [None]:
# Reward per cell character: ' ' empty cell, 'D' dragon, 'J' goal, 'S' start.
REWARDS = {
    " ": 0.0,
    "D": -1.0,
    "J": 12.0,
    "S": 0.0,
}

# Possible moves; the position in this list is the action index.
DIRECTIONS = ["HAUT", "DROITE", "BAS", "GAUCHE"]

Nparties = 10000  # number of games played during training
Ncoups = 100      # maximum number of moves per game
alpha = 0.81      # learning rate of the Q update
gamma = 0.96      # discount factor of the Q update

In [2]:
def init_random_space(lines, columns, nbDragon):
    """
    Build a ``lines`` x ``columns`` grid with randomly placed dragons.

    Cells hold ' ' (empty), 'S' (start, top-left), 'J' (goal, bottom-right)
    or 'D' (dragon).

    Args:
        lines: number of rows.
        columns: number of columns.
        nbDragon: number of dragons to place; values below 0 place none.

    Returns:
        The grid as a list of rows, each row a list of characters.

    Raises:
        ValueError: if ``nbDragon`` exceeds the number of empty cells.
    """
    space = [[' '] * columns for _ in range(lines)]
    space[0][0] = 'S'
    space[lines-1][columns-1] = 'J'

    # Sample the dragon cells among the empty ones: exactly nbDragon dragons
    # are placed and the function cannot spin forever on a crowded grid.
    free_cells = [
        (l, c)
        for l in range(lines)
        for c in range(columns)
        if space[l][c] == ' '
    ]
    nbDragon = max(nbDragon, 0)
    if nbDragon > len(free_cells):
        raise ValueError(
            "cannot place %d dragons on %d empty cells" % (nbDragon, len(free_cells))
        )
    for l, c in random.sample(free_cells, nbDragon):
        space[l][c] = 'D'
    return space

def init_space():
    """Return a 4 x 4 grid holding 3 randomly placed dragons."""
    return init_random_space(4, 4, 3)

def isWin(space, position):
    """
    Tell whether ``position`` is the goal cell.

    Args:
        space: grid as a list of rows of characters.
        position: ``(line, column)`` to test.

    Returns:
        True when the cell holds 'J', False otherwise (always a bool).
    """
    (l, c) = position
    return space[l][c] == 'J'


# a method to apply an action to the player
# returns [position, reward, fin]
def application_action(action, position, space):
    """
    Apply ``action`` to the player standing at ``position``.

    A move that would leave the grid is ignored (the player stays in place).
    Walking onto a dragon sends the player back to the starting point (0, 0).

    Args:
        action: one of "HAUT", "DROITE", "BAS", "GAUCHE"; any other value
            leaves the player in place.
        position: current ``(line, column)`` of the player.
        space: grid as a list of rows of characters.

    Returns:
        ``[position, reward, fin]`` where ``position`` is the resulting
        position, ``reward`` is the reward of the cell that was entered (0
        for characters the Rewards table does not know, such as ' ') and
        ``fin`` is True when the player stands on the goal.
    """
    (l, c) = position
    moves = {
        "HAUT": (-1, 0),
        "DROITE": (0, 1),
        "BAS": (1, 0),
        "GAUCHE": (0, -1),
    }
    delta_l, delta_c = moves.get(action, (0, 0))
    next_l, next_c = l + delta_l, c + delta_c

    # only move when the next position is inside the grid
    # (rows and columns are checked against their own sizes)
    if 0 <= next_l < len(space) and 0 <= next_c < len(space[next_l]):
        position = (next_l, next_c)

    # cell the player stands on after the (possibly ignored) move
    case = space[position[0]][position[1]]

    # back to the starting point if a dragon is encountered
    if case == 'D':
        position = (0, 0)

    # set the reward; the Rewards table has no entry for every cell character
    reward = Rewards.get_reward(case)
    if reward is None:
        reward = 0

    # check if the player is at the goal
    fin = bool(isWin(space, position))

    return [position, reward, fin]


# Backward-compatible alias for the original (misspelled) function name.
applicaion_action = application_action

In [None]:
# build the grid used by the rest of the notebook and show it
space = init_space()
print(space)

# position of the knight
player_pos = (0, 0)

# for i in range(Nparties):
#     print("------------------------------- tour ", i)
#     # on choisit une direction aléatoire
#     action = random.choice(DIRECTIONS)
#     print("action : ", action)
#     # on applique l'action
#     player_pos, reward, fin = application_action(action, player_pos, space)
#     print("position : ", player_pos)
#     print("reward : ", reward)
#     print("fin : ", fin)

#     # fin de partie
#     if fin:
#         print("fin de partie")
#         break

# 2. Développement du Q-learning

In [None]:
# epsilon-greedy action selection
def choose_action(state, epsilon, mat_q):
    """
    Pick a direction with the epsilon-greedy policy.

    With probability ``epsilon`` a random direction is returned, otherwise
    the direction with the highest Q value for ``state``.
    """
    explore = random.random() < epsilon
    if explore:
        return Directions.get_random_direction()
    return Directions.get_max_direction(mat_q, state)

# a method to play one step (with mat_q, state, epsilon)
def oneStep(mat_q, state, epsilon):
    """
    Play one move and update ``mat_q`` in place.

    Note: a later cell of this notebook defines ``oneStep`` again with a
    different return value; that later definition replaces this one.

    Args:
        mat_q: Q table indexed as ``mat_q[state][action_index]``.
        state: current ``(line, column)`` position.
        epsilon: exploration probability of the epsilon-greedy policy.

    Returns:
        ``[next_state, reward, fin]``.
    """
    action = choose_action(state, epsilon, mat_q)
    (next_state, reward, fin) = applicaion_action(action, state, space)

    # Q-learning update, done here because no module-level update_q exists
    action_index = Directions.get_index(action)
    target = reward + gamma * np.max(mat_q[next_state])
    mat_q[state][action_index] += alpha * (target - mat_q[state][action_index])

    return [next_state, reward, fin]

In [None]:
def oneStep(mat_q, state, epsilon):
    """
    Play one move from ``state`` and apply the Q-learning update.

    Returns:
        ``(mat_q, new_state, fin)``; ``mat_q`` is the same array, updated in
        place.
    """
    # pick the move, then play it
    action = choose_action(state, epsilon, mat_q)
    new_state, reward, fin = application_action(action, state, space)

    # Q update
    # SARSA-style alternative kept for reference:
    #mat_q[state][DIRECTIONS.index(action)] += alpha * (reward + gamma * (mat_q[new_state][DIRECTIONS.index(choose_action(new_state, epsilon, mat_q))]) - mat_q[state][DIRECTIONS.index(action)])
    action_index = DIRECTIONS.index(action)
    best_next = np.max(mat_q[new_state])
    td_error = reward + gamma * best_next - mat_q[state][action_index]
    mat_q[state][action_index] += alpha * td_error

    return mat_q, new_state, fin

In [None]:
# create the Q table: one vector of action values per (line, column) cell
mat_q = np.zeros((len(space), len(space[0]), len(DIRECTIONS)))

totalSteps = 0

# run the Q-learning algorithm
for iterationPartie in range(Nparties):
    state = (0,0)
    # epsilon goes from 1 (first game) down towards 0.5 (last game)
    epsilon = Nparties/(Nparties+iterationPartie)

    for iterationCoups in range(Ncoups):
        mat_q, state, fin = oneStep(mat_q, state, epsilon)
        if fin:
            # iterationCoups is a 0-based index, so the game took one more move
            totalSteps += iterationCoups + 1
            break
    else:
        # the goal was never reached: all Ncoups moves were played
        totalSteps += Ncoups

print("nombre de coups total : ", totalSteps)
print("moyenne de coups par partie : ", totalSteps/Nparties)

#print(mat_q)


In [None]:

# greedy run (epsilon = 0) with the learned Q table, from the starting point
state = (0, 0)
for iterationCoups in range(Ncoups):
    mat_q, state, fin = oneStep(mat_q, state, 0)
    if fin:
        # iterationCoups is a 0-based index, hence the + 1
        print("fin de partie en ", iterationCoups + 1, " coups")
        break


# Deep Q-Learning

In [None]:
# IMPORTS
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sys


: 

##### Test avec une structure 2 couches denses ayant 16 entrées (nombre de cases) et 4 sorties (4 actions)


In [None]:
# 1 modifier la fonction choose_action avec en sortir model.predict(vec_etat)
def choose_action(vect_etat, epsilon, model):
    """
    Epsilon-greedy action selection driven by a model.

    Args:
        vect_etat: state vector passed to ``model.predict``.
        epsilon: probability of picking a random direction.
        model: object whose ``predict`` returns one value per direction.

    Returns:
        The name of the chosen direction.
    """
    # explore: random direction with probability epsilon
    if random.random() < epsilon:
        return random.choice(DIRECTIONS)

    # exploit: direction whose predicted Q value is the largest
    q_values = model.predict(vect_etat, verbose=0)
    return DIRECTIONS[np.argmax(q_values)]


In [None]:
# model creation
# A simple structure with 16 inputs and 4 outputs; the output layer has no activation.
# Sequential and Dense are reached through the imported `keras` and `layers`
# modules because the bare names are never imported in this notebook.
model = keras.Sequential([
    layers.Dense(4, activation='relu', input_shape=[16]),
    layers.Dense(4, activation='relu'),
    layers.Dense(4),
])

# Second model with the same architecture and the same weights, used as the
# stable model when computing the learning targets.
model_stable = keras.models.clone_model(model)
model_stable.set_weights(model.get_weights())

In [None]:
# optimizer: Adam with a learning rate of 0.01
optimizer = keras.optimizers.Adam(learning_rate=0.01)

# loss function: mean squared error
loss_fn = keras.losses.mean_squared_error

# activation functions with hand-written derivatives

# relu
@tf.custom_gradient
def my_relu(x):
    """ReLU whose gradient is given explicitly: the incoming gradient is kept where x > 0 and zeroed elsewhere."""
    def grad(dy):
        positive_mask = tf.cast(x > 0, tf.float32)
        return dy * positive_mask
    return tf.nn.relu(x), grad

# tanh
@tf.custom_gradient
def my_tanh(x):
    """tanh whose gradient is given explicitly as dy * (1 - tanh(x) ** 2)."""
    activation = tf.math.tanh(x)

    def grad(dy):
        slope = 1 - activation ** 2
        return dy * slope

    return activation, grad


In [None]:
# initialisation function
def init():
    """
    Return the starting position and its one-hot state vector.

    The vector has one entry per cell of the global ``space`` (cells numbered
    row by row) instead of a hard-coded 16, and the row stride is the number
    of columns.

    Returns:
        ``(position, vect_etat)`` with ``position == (0, 0)`` and
        ``vect_etat`` an array of shape ``(1, number of cells)`` holding a
        single 1 at the index of ``position``.
    """
    position = (0,0)
    n_columns = len(space[0])
    vect_etat = np.zeros((1, len(space) * n_columns))
    vect_etat[0, int(n_columns * position[0] + position[1])] = 1

    return position, vect_etat

In [None]:
position, vect_etat = init()

# LEARNING ITERATIONS

# row stride of the flattened one-hot state vector
n_columns = len(space[0])

for iterationPartie in range(Nparties):
    progress = "Partie " + str(iterationPartie) + "/" + str(Nparties)
    sys.stdout.write("\r" + progress)

    # every game restarts from the start cell with a fresh one-hot vector, so
    # no bit of the previous game's final cell is left set
    position, vect_etat = init()

    # epsilon goes from 1 (first game) down towards 0.5 (last game)
    epsilon = Nparties/(Nparties+iterationPartie)

    fin = False

    while not fin:
        # choose an action from the current state
        action = choose_action(vect_etat, epsilon, model)

        # apply the action
        new_position, reward, fin = application_action(action, position, space)

        # one-hot vector of the state that was reached; the current state
        # vector is kept untouched because the gradient step needs it
        new_vect_etat = np.zeros_like(vect_etat)
        new_vect_etat[0, int(n_columns * new_position[0] + new_position[1])] = 1

        # learning target: no future value once the goal is reached, otherwise
        # bootstrap from the stable model evaluated on the NEW state
        if fin:
            target = reward
        else:
            sortie_Q_stable = model_stable.predict(new_vect_etat, verbose=0)
            max_Q = np.max(sortie_Q_stable)
            target = reward + gamma * max_Q

        # gradient descent on Q(state, action), i.e. the state the action was taken FROM
        with tf.GradientTape() as tape:
            predict = model(vect_etat, training=True)  # current estimate
            mask = tf.one_hot(DIRECTIONS.index(action), len(DIRECTIONS))  # selects the output of the chosen action
            val_predict = tf.reduce_sum(predict * mask, axis=1)
            loss = loss_fn(target, val_predict)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # move to the new state
        position = new_position
        vect_etat = new_vect_etat

    # refresh the stable model once every 100 games (not on every move)
    if iterationPartie % 100 == 0:
        model_stable.set_weights(model.get_weights())


# save the model
model.save('deep_Q_learning_model.h5')

In [None]:
# JEU

def play():
    """
    Play one greedy game with the saved model and print what happens.

    Prints the grid, loads 'deep_Q_learning_model.h5', then plays at most 100
    moves with epsilon = 0, printing each chosen action (ten per line) and
    finally the outcome.
    """
    # print the grid, one row per line (each row is walked over its own length)
    for row in space:
        for cell in row:
            print(cell, end="| ")
        print()

    # load the model
    model = keras.models.load_model('deep_Q_learning_model.h5')

    # row stride of the flattened one-hot state vector
    n_columns = len(space[0])

    position = (0,0)
    fin = False

    MAX_ITER = 100
    n_iter = 0  # not named `iter`, which would shadow the builtin
    while not fin and n_iter < MAX_ITER:
        n_iter += 1

        # one-hot state vector of the current position
        vect_etat = np.zeros((1, len(space) * n_columns))
        vect_etat[0, int(n_columns * position[0] + position[1])] = 1

        # greedy choice
        action = choose_action(vect_etat, 0, model)

        # apply the action and move
        position, _, fin = application_action(action, position, space)

        # print the actions on one line, with a line break every 10 actions
        print(action, end=", ")
        if n_iter % 10 == 0:
            print()

    if fin:
        print("Victoire ! en " + str(n_iter) + " itérations")
    else:
        print("Défaite")
        if (n_iter >= MAX_ITER):
            print("Trop d'itérations")

play()