# Apprentissage par renforcement "le jeu du Frozen Lake"

# Importation de gym et des autres bibliothèques nécessaires

In [14]:
import gym
import numpy as np
import random
import time
from gym import utils
from gym.envs.registration import register

# Fonction d'affichage

In [19]:
# Action codes used by the environment and the French label shown for each.
MAP = {
    0: "Gauche",
    1: "Bas",
    2: "Droite",
    3: "Haut",
}


def map_action(action_int):
    """Translate an action code into its display label.

    Returns the label stored in ``MAP`` for ``action_int``, or ``None`` when
    the code has no entry in ``MAP``.
    """
    try:
        return MAP[action_int]
    except KeyError:
        return None

def my_render(env):
    """Print the grid with the agent's current tile highlighted.

    The label of the last action is printed first, then the map, one row per
    line between ``|`` characters. "F" tiles are shown as blanks and "H" tiles
    as "X"; the tile the agent stands on is highlighted in red.

    Args:
        env: Environment, possibly wrapped, whose underlying object exposes
            ``s`` (current state index), ``ncol``, ``desc`` (2-D array of
            byte characters) and ``lastaction``.
    """
    # Read the state from the innermost environment so this function does not
    # rely on wrappers forwarding these attributes.
    base_env = getattr(env, "unwrapped", env)
    # The state index is row-major: row = s // ncol, col = s % ncol.
    row, col = divmod(base_env.s, base_env.ncol)
    desc = [[c.decode('utf-8') for c in line] for line in base_env.desc.tolist()]
    desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
    print("\nDernière action faite: {}".format(map_action(base_env.lastaction)))
    grid = "|" + "|\n|".join(''.join(line) for line in desc) + "|"
    # Applied after colorizing, so a highlighted "F"/"H" is substituted too.
    grid = grid.replace("F", " ").replace("H", "X")
    print(grid)

# 1. Initialisation de l'environnement

In [3]:
# Deterministic (non-slippery) variant of the 4x4 Frozen Lake map.
ENV_ID = "FrozenLakeNotSlippery-v0"

try:
    register(
        id=ENV_ID,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
    )
except gym.error.Error:
    # Older gym releases refuse to register the same id twice, which happens
    # whenever this cell is re-run in a live kernel. The id and arguments are
    # constant, so the registration made by the earlier run is reused.
    pass

env = gym.make(ENV_ID)

# 1.1. Nombre d'états dans l'environnement  

In [4]:
# Number of discrete states, read from the environment's observation space.
state_size = env.observation_space.n

# 1.2. Nombre d'actions possibles dans chaque état

In [5]:
# Number of discrete actions, read from the environment's action space.
action_size = env.action_space.n

# 1.3. Création de la Q-table et initialisation des Q-values 


In [6]:
# Q-table with one row per state and one column per action; every Q-value
# starts at 0.
Q_table = np.zeros((state_size, action_size))
# Total reward of each training episode (one entry appended per episode).
rewards = []

# 1.4. Nombre maximum d'épisodes

In [7]:
# Number of episodes run by the training loop.
MAX_EPISODES = 15000

# 1.5. Taux d'apprentissage (ou learning rate -> alpha) et facteur d'actualisation (ou discount factor -> gamma)

In [8]:
# Learning rate: weight given to the (target - current) difference in each
# Q-value update.
ALPHA = 0.8
# Discount factor applied to the best Q-value of the next state.
GAMMA = 0.95

# 1.6. Initialisation d'epsilon (politique epsilon-greedy)

In [9]:
# Epsilon-greedy exploration schedule.
MAX_EPSILON = 1.0      # upper bound of the exploration probability
MIN_EPSILON = 0.01     # value the exploration probability decays towards
DECAY_RATE = 0.005     # rate of the exponential decay, applied per episode
EPSILON = MAX_EPSILON  # current exploration probability; starts at the maximum

# 2. Exécution de l'action dans l'environnement et modification de la reward par défaut

In [16]:
def take_action(action, env, hole_reward=-1, goal_reward=1, step_reward=-0.01):
    """Step the environment and replace its reward with a shaped one.

    The reward returned by ``env.step`` is discarded. The tile reached is
    looked up in the environment's ``desc`` map (row-major state index), so
    holes ("H") and the goal ("G") are recognised on any map. When the
    environment exposes no ``desc``, the state numbers of the default 4x4 map
    are used instead (holes 5, 7, 11, 12; goal 15).

    Args:
        action: Action passed to ``env.step``.
        env: Environment, possibly wrapped; ``step`` must return
            ``(new_state, reward, done, info)``.
        hole_reward: Reward given when the new state is a hole.
        goal_reward: Reward given when the new state is the goal.
        step_reward: Reward given for any other state (penalises wandering).

    Returns:
        tuple: ``(new_state, reward, done, info)`` with the shaped reward.
    """
    new_state, _, done, info = env.step(action)

    base_env = getattr(env, "unwrapped", env)
    desc = getattr(base_env, "desc", None)
    if desc is not None:
        tile = np.asarray(desc).ravel()[new_state]
        # The map may hold byte characters; normalise to str before comparing.
        if isinstance(tile, bytes):
            tile = tile.decode("utf-8")
        is_hole = tile == "H"
        is_goal = tile == "G"
    else:
        # No map available: fall back to the default 4x4 layout.
        is_hole = new_state in (5, 7, 11, 12)
        is_goal = new_state == 15

    if is_hole:
        reward = hole_reward
    elif is_goal:
        reward = goal_reward
    else:
        reward = step_reward
    return new_state, reward, done, info

In [None]:
# Display switches. Rendering every step with a pause lets you watch the agent
# but dominates the run time over MAX_EPISODES episodes; set RENDER to False
# to train without display.
RENDER = True
RENDER_DELAY = 0.1  # seconds slept after each rendered frame

for episode in range(MAX_EPISODES):

    S = env.reset()
    done = False
    total_rewards = 0

    while not done:
        # Step 1: epsilon-greedy action selection. Explore with probability
        # EPSILON, otherwise take the action with the highest Q-value in S.
        if random.uniform(0, 1) < EPSILON:
            A = env.action_space.sample()
        else:
            A = np.argmax(Q_table[S, :])
        # Step 2: apply the action and get the shaped reward.
        S_, R, done, info = take_action(A, env)
        # Step 3: Q-learning update. When the episode has ended the target is
        # the reward alone; otherwise it adds the discounted best Q-value of
        # the next state.
        q_predict = Q_table[S, A]
        if done:
            q_target = R
        else:
            q_target = R + GAMMA * np.max(Q_table[S_, :])
        Q_table[S, A] += ALPHA * (q_target - q_predict)
        total_rewards += R
        S = S_

        if RENDER:
            # Custom, more readable display (env.render() is the default one).
            my_render(env)
            time.sleep(RENDER_DELAY)

    # Exponential epsilon decay from MAX_EPSILON towards MIN_EPSILON.
    EPSILON = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * episode)
    rewards.append(total_rewards)


Dernière action faite: Bas
|S   |
|[41m [0mX X|
|   X|
|X  G|

Dernière action faite: Haut
|[41mS[0m   |
| X X|
|   X|
|X  G|

Dernière action faite: Bas
|S   |
|[41m [0mX X|
|   X|
|X  G|

Dernière action faite: Gauche
|S   |
|[41m [0mX X|
|   X|
|X  G|

Dernière action faite: Haut
|[41mS[0m   |
| X X|
|   X|
|X  G|

Dernière action faite: Bas
|S   |
|[41m [0mX X|
|   X|
|X  G|

Dernière action faite: Haut
|[41mS[0m   |
| X X|
|   X|
|X  G|

Dernière action faite: Droite
|S[41m [0m  |
| X X|
|   X|
|X  G|

Dernière action faite: Droite
|S [41m [0m |
| X X|
|   X|
|X  G|

Dernière action faite: Bas
|S   |
| X[41m [0mX|
|   X|
|X  G|

Dernière action faite: Bas
|S   |
| X X|
|  [41m [0mX|
|X  G|

Dernière action faite: Bas
|S   |
| X X|
|   X|
|X [41m [0mG|

Dernière action faite: Droite
|S   |
| X X|
|   X|
|X  [41mG[0m|

Dernière action faite: Bas
|S   |
|[41m [0mX X|
|   X|
|X  G|

Dernière action faite: Droite
|S   |
| [41mX[0m X|
|   X|
|X  G|

Derniè