# Projet Deep Reinforcement Learning

## L'environnement Line World

#### Import des bibliothèques

In [222]:
import numpy as np
import time
from typing import Callable

#### Initialisation des variables d'environnement

In [223]:
# Line World: a 1-D corridor of `num_states` cells indexed 0 .. num_states - 1.
num_states = 5
# S: every state index.
S = np.arange(num_states)
A = np.array([0, 1])  # 0: left, 1 : right
# T: terminal states, i.e. the two ends of the line.
T = np.array([0, num_states - 1])
# P[s, a, s_p, 0] is read by `step` as the probability of moving from s to s_p
# under action a; P[s, a, s_p, 1] is read as the reward of that transition.
P = np.zeros((len(S), len(A), len(S), 2))

In [224]:
# Interior (non-terminal) states move deterministically one cell left (action 0)
# or one cell right (action 1).
for s in S[1:-1]:
    P[s, 0, s - 1, 0] = 1.0
    P[s, 1, s + 1, 0] = 1.0
# Stepping onto the left end yields -1, stepping onto the right end yields +1;
# every other transition keeps the reward of 0 that np.zeros initialised.
P[1, 0, 0, 1] = -1.0
P[num_states - 2, 1, num_states - 1, 1] = 1.0

#### Définition des fonctions de l'environnement

In [225]:
def reset() -> int:
    """Return the initial state of an episode: the middle cell of the line."""
    middle_state = num_states // 2
    return middle_state


def is_terminal(state: int) -> bool:
    """Tell whether ``state`` is one of the terminal states listed in ``T``."""
    matches = T == state
    return bool(matches.any())


def step(state: int, action: int) -> (int, float):
    """Sample one environment transition from a non-terminal state.

    The successor is drawn among ``S`` according to the probabilities stored
    in ``P[state, action, :, 0]``; the reward is the matching entry of
    ``P[state, action, :, 1]``.

    Returns:
        ``(next_state, reward)``.

    Raises:
        AssertionError: if ``state`` is terminal.
    """
    assert not is_terminal(state)
    outcomes = P[state, action]  # rows: successor state; columns: (probability, reward)
    next_state = np.random.choice(S, p=outcomes[:, 0])
    return next_state, outcomes[next_state, 1]

#### Définition de la politique aléatoire uniforme

In [226]:
def tabular_random_uniform_policy(state_size: int, action_size: int) -> np.ndarray:
    """Build a tabular policy giving every action the same probability.

    Returns:
        Array of shape ``(state_size, action_size)`` whose entries all equal
        ``1 / action_size``.

    Raises:
        AssertionError: if ``action_size`` is not strictly positive.
    """
    assert action_size > 0
    uniform_table = np.ones((state_size, action_size,))
    uniform_table = uniform_table / action_size
    return uniform_table

#### Implémentation de l'algorithme Iterative Policy Evaluation

In [227]:
def iterative_policy_evaluation(
        S: np.ndarray,
        A: np.ndarray,
        P: np.ndarray,
        T: np.ndarray,
        Pi: np.ndarray,
        gamma: float = 0.99,
        theta: float = 0.00001
) -> np.ndarray:
    """Estimate the state values of policy ``Pi`` with in-place sweeps.

    Args:
        S: state indices.
        A: action indices.
        P: array indexed ``[s, a, s_p, k]``; ``k == 0`` is used as the
            transition probability and ``k == 1`` as the reward.
        T: indices of the states whose value is forced to 0 at start.
        Pi: array indexed ``[s, a]`` giving the probability of each action.
        gamma: discount factor, must lie in ``[0, 1]``.
        theta: sweeps stop once the largest value change of a sweep is below
            this strictly positive threshold.

    Returns:
        One value per state.
    """
    assert theta > 0
    assert 0 <= gamma <= 1
    # Random start for every state, except the states in T which are pinned to 0.
    values = np.random.random((S.shape[0],))
    values[T] = 0.0
    converged = False
    while not converged:
        largest_change = 0
        for state in S:
            previous = values[state]
            backup = 0
            for action in A:
                for successor in S:
                    weight = Pi[state, action] * P[state, action, successor, 0]
                    target = P[state, action, successor, 1] + gamma * values[successor]
                    backup += weight * target
            # In-place update: later states of this sweep already see the new value.
            values[state] = backup
            largest_change = np.maximum(largest_change, np.abs(previous - backup))
        converged = largest_change < theta
    return values

#### Test de l'algorithme Iterative Policy Evaluation

In [228]:
# Evaluate the uniform random policy on the Line World and print its state values.
print("Evaluation policy random :")
Pi = tabular_random_uniform_policy(S.shape[0], A.shape[0])
V = iterative_policy_evaluation(S, A, P, T, Pi)
print(V)

Evaluation policy random :
[ 0.00000000e+00 -4.99994602e-01  5.34431323e-06  5.00002645e-01
  0.00000000e+00]


In [229]:
# Evaluate the deterministic policy that takes action 1 (right) in every
# interior state; the two end states keep an all-zero row.
print('Evaluation Policy "Toujours vers la droite !" :')
Pi = np.zeros((S.shape[0], A.shape[0]))
Pi[1:-1, 1] = 1.0
V = iterative_policy_evaluation(S, A, P, T, Pi)
print(V)

Evaluation Policy "Toujours vers la droite !" :
[0.     0.9801 0.99   1.     0.    ]


In [230]:
# Evaluate the deterministic policy that takes action 0 (left) in every
# interior state; the two end states keep an all-zero row.
print('Evaluation Policy "Toujours vers la gauche !" :')
Pi = np.zeros((S.shape[0], A.shape[0]))
Pi[1:-1, 0] = 1.0
V = iterative_policy_evaluation(S, A, P, T, Pi)
print(V)

Evaluation Policy "Toujours vers la gauche !" :
[ 0.     -1.     -0.99   -0.9801  0.    ]


#### Implémentation de l'algorithme Policy Iteration

In [231]:
def policy_iteration(
        S: np.ndarray,
        A: np.ndarray,
        P: np.ndarray,
        T: np.ndarray,
        gamma: float = 0.99,
        theta: float = 0.00001
) -> (np.ndarray, np.ndarray):
    """Find a greedy policy by alternating policy evaluation and improvement.

    Args:
        S: state indices.
        A: action indices.
        P: array indexed ``[s, a, s_p, k]``; ``k == 0`` is used as the
            transition probability and ``k == 1`` as the reward.
        T: terminal state indices, forwarded to the evaluation routine.
        gamma: discount factor.
        theta: convergence threshold forwarded to the evaluation routine.

    Returns:
        ``(V, Pi)``: the state values of the final policy and the final
        deterministic policy as a one-hot ``[s, a]`` table.
    """
    Pi = tabular_random_uniform_policy(S.shape[0], A.shape[0])
    while True:
        V = iterative_policy_evaluation(S, A, P, T, Pi, gamma, theta)
        policy_stable = True
        for s in S:
            old_action = np.argmax(Pi[s])
            best_action = 0
            best_action_score = -np.inf
            for a in A:
                # Action value q(s, a) = sum_{s'} p(s'|s, a) * (r + gamma * V[s']).
                # It must NOT be weighted by Pi[s, a]: with a deterministic policy
                # that weight zeroes every unselected action, which then wrongly
                # beats a selected action whose true value is negative.
                action_score = 0.0
                for s_p in S:
                    action_score += P[s, a, s_p, 0] * (
                            P[s, a, s_p, 1] + gamma * V[s_p]
                    )
                # Strict comparison: ties keep the first action encountered.
                if action_score > best_action_score:
                    best_action = a
                    best_action_score = action_score
            Pi[s] = 0.0
            Pi[s, best_action] = 1.0
            if old_action != best_action:
                policy_stable = False
        if policy_stable:
            break
    # Pi may have been rewritten as one-hot during the last (stable) pass, so the
    # values are recomputed for the policy that is actually returned.
    V = iterative_policy_evaluation(S, A, P, T, Pi, gamma, theta)
    return V, Pi

#### Test de l'algorithme Policy Iteration

In [232]:
# Run policy iteration on the Line World, then print the resulting values,
# the resulting policy table and the elapsed wall-clock time in seconds.
t1 = time.time()
V, Pi = policy_iteration(S, A, P, T)
print(V)
print(Pi)
print(f"time : {time.time() - t1}")

[0.     0.9801 0.99   1.     0.    ]
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
time : 0.003989219665527344


#### Implémentation de l'algorithme Value Iteration

In [233]:
def value_iteration(
    S: np.ndarray,
    A: np.ndarray,
    P: np.ndarray,
    T: np.ndarray,
    gamma: float = 0.99,
    theta: float = 0.00001
) -> (np.ndarray, np.ndarray):
    """Compute state values by value iteration and extract a greedy policy.

    Args:
        S: state indices.
        A: action indices.
        P: array indexed ``[s, a, s_p, k]``; ``k == 0`` is used as the
            transition probability and ``k == 1`` as the reward.
        T: indices of the states whose value is forced to 0 at start.
        gamma: discount factor, must lie in ``[0, 1]``.
        theta: sweeps stop once the largest value change of a sweep is below
            this strictly positive threshold.

    Returns:
        ``(V, Pi)``: one value per state and a one-hot ``[s, a]`` policy table
        that is greedy with respect to ``V`` (ties go to the first action).

    Raises:
        AssertionError: if ``theta <= 0`` (the loop could never stop) or
            ``gamma`` is outside ``[0, 1]``.
    """
    assert theta > 0
    assert 0 <= gamma <= 1

    V = np.random.random((S.shape[0],))
    V[T] = 0.0

    def action_value(s, a):
        # q(s, a) = sum_{s'} p(s'|s, a) * (r(s, a, s') + gamma * V[s']).
        return np.sum(P[s, a, S, 0] * (P[s, a, S, 1] + gamma * V[S]))

    while True:
        delta = 0.0
        for s in S:
            v_old = V[s]
            # In-place update: later states of this sweep already see the new value.
            V[s] = max(action_value(s, a) for a in A)
            delta = max(delta, abs(v_old - V[s]))
        if delta < theta:
            break

    # Greedy policy extraction. No policy weight is involved here: the score of
    # an action is its plain action value.
    Pi = np.zeros((S.shape[0], A.shape[0]))
    for s in S:
        q_values = [action_value(s, a) for a in A]
        # np.argmax returns the first maximiser, i.e. ties keep the lowest action.
        Pi[s, A[int(np.argmax(q_values))]] = 1.0
    return V, Pi

#### Test de l'algorithme Value Iteration

In [234]:
# Run value iteration on the Line World, then print the resulting values,
# the greedy policy table and the elapsed wall-clock time in seconds.
t1 = time.time()
V, Pi = value_iteration(S, A, P, T)
print(V)
print(Pi)
print(f"le time d'execution : {time.time() - t1}")

[0.     0.9801 0.99   1.     0.    ]
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
le time d'execution : 0.0009968280792236328


#### Définition d'un utilitaire pour simuler les étapes d'un épisode

In [235]:
def step_until_the_end_of_the_episode_and_generate_trajectory(
        s0: int,
        pi: np.ndarray,
        step_func: Callable,
        is_terminal_func: Callable,
        max_steps: int = 10
) -> ([int], [int], [int], [float]):
    """Roll out policy ``pi`` from ``s0`` and record every transition.

    The rollout stops as soon as ``is_terminal_func`` accepts the current
    state or ``max_steps`` transitions have been taken.

    Args:
        s0: state the rollout starts from.
        pi: array indexed ``[s, a]`` with the probability of each action.
        step_func: called as ``step_func(state, action)``; must return
            ``(next_state, reward)``.
        is_terminal_func: called as ``is_terminal_func(state)``.
        max_steps: upper bound on the number of recorded transitions.

    Returns:
        Four lists of equal length: visited states, chosen actions, successor
        states and rewards.
    """
    action_ids = np.arange(pi.shape[1])
    transitions = []
    current = s0
    while True:
        # The terminal test comes first so it is evaluated exactly as often as
        # in a `while not terminal and count < max_steps` loop.
        if is_terminal_func(current) or len(transitions) >= max_steps:
            break
        chosen = np.random.choice(action_ids, p=pi[current])
        successor, reward = step_func(current, chosen)
        transitions.append((current, chosen, successor, reward))
        current = successor

    visited_states = [tr[0] for tr in transitions]
    chosen_actions = [tr[1] for tr in transitions]
    successor_states = [tr[2] for tr in transitions]
    rewards = [tr[3] for tr in transitions]
    return visited_states, chosen_actions, successor_states, rewards

#### Implémentation de l'algorithme Monte Carlo ES

In [236]:
def monte_carlo_es(
        states_count: int,
        actions_count: int,
        step_func: Callable,
        is_terminal_func: Callable,
        max_episodes: int = 1000,
        max_steps_per_episode: int = 10,
        gamma: float = 0.99
) -> (np.ndarray, np.ndarray):
    """First-visit Monte Carlo control with exploring starts.

    Each episode begins with a uniformly drawn non-terminal state and a
    uniformly drawn action, then follows the current greedy policy.

    Args:
        states_count: number of states (states are ``0 .. states_count - 1``).
        actions_count: number of actions (``0 .. actions_count - 1``).
        step_func: called as ``step_func(state, action)``; must return
            ``(next_state, reward)``.
        is_terminal_func: called as ``is_terminal_func(state)``.
        max_episodes: number of episodes actually simulated.
        max_steps_per_episode: cap on the policy-driven part of an episode
            (the exploring-start transition is not counted).
        gamma: discount factor.

    Returns:
        ``(Q, pi)``: action-value estimates and the greedy policy table.
        Rows of terminal states are all zeros in both arrays.
    """
    pi = tabular_random_uniform_policy(states_count, actions_count)
    states = np.arange(states_count)
    actions = np.arange(actions_count)

    Q = np.random.random((states_count, actions_count))

    for s in states:
        if is_terminal_func(s):
            Q[s, :] = 0.0
            pi[s, :] = 0.0

    # Exploring starts are drawn among non-terminal states only. Decrementing the
    # loop variable of a `for ... in range(...)` loop has no effect, so rejecting
    # terminal draws inside the loop would silently lose those episodes.
    start_states = [s for s in states if not is_terminal_func(s)]
    if not start_states:
        # Nothing can be simulated when every state is terminal.
        return Q, pi

    returns = np.zeros((states_count, actions_count))
    returns_count = np.zeros((states_count, actions_count))

    for _ in range(max_episodes):
        s0 = np.random.choice(start_states)
        a0 = np.random.choice(actions)

        s1, r1 = step_func(s0, a0)

        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_generate_trajectory(
            s1, pi, step_func, is_terminal_func, max_steps_per_episode)
        s_list.insert(0, s0)
        a_list.insert(0, a0)
        r_list.insert(0, r1)

        # Index of the first occurrence of every (state, action) pair, so the
        # first-visit test below is a dictionary lookup instead of a rescan.
        first_visit = {}
        for t, pair in enumerate(zip(s_list, a_list)):
            first_visit.setdefault(pair, t)

        G = 0.0
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]
            if first_visit[(st, at)] != t:
                continue

            returns[st, at] += G
            returns_count[st, at] += 1
            Q[st, at] = returns[st, at] / returns_count[st, at]
            pi[st, :] = 0.0
            pi[st, np.argmax(Q[st, :])] = 1.0
    return Q, pi

#### Test de l'algorithme Monte Carlo ES

In [237]:
# Run Monte Carlo ES on the Line World, then print the action values, the
# policy table and the elapsed wall-clock time in seconds.
t1 = time.time()
Q, Pi = monte_carlo_es(len(S), len(A), step, is_terminal,
                                                      max_episodes=10000, max_steps_per_episode=100)
print(Q)
print(Pi)
print(f"le time d'execution : {time.time() - t1}")

[[ 0.          0.        ]
 [-1.          0.98008031]
 [ 0.97027907  0.99      ]
 [ 0.9801      1.        ]
 [ 0.          0.        ]]
[[0. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 0.]]
le time d'execution : 0.6852025985717773


#### Implémentation de l'algorithme On-Policy First-Visit Monte Carlo

In [238]:
def on_policy_first_visit_monte_carlo(
        states_count: int,
        actions_count: int,
        reset_func: Callable,
        step_func: Callable,
        is_terminal_func: Callable,
        max_episodes: int = 1000,
        max_steps_per_episode: int = 10,
        gamma: float = 0.99,
        epsilon: float = 0.1
) -> (np.ndarray, np.ndarray):
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    Args:
        states_count: number of states (states are ``0 .. states_count - 1``).
        actions_count: number of actions.
        reset_func: called without arguments; returns the start state.
        step_func: called as ``step_func(state, action)``; must return
            ``(next_state, reward)``.
        is_terminal_func: called as ``is_terminal_func(state)``.
        max_episodes: number of episodes to simulate.
        max_steps_per_episode: cap on the length of an episode.
        gamma: discount factor.
        epsilon: exploration mass spread uniformly over all actions.

    Returns:
        ``(Q, pi)``: action-value estimates and the epsilon-greedy policy
        table. Rows of terminal states are all zeros in both arrays.
    """
    pi = tabular_random_uniform_policy(states_count, actions_count)
    Q = np.random.random((states_count, actions_count))

    # Terminal states carry neither value nor action probabilities.
    for state in np.arange(states_count):
        if is_terminal_func(state):
            Q[state, :] = 0.0
            pi[state, :] = 0.0

    return_sums = np.zeros((states_count, actions_count))
    visit_counts = np.zeros((states_count, actions_count))

    explore_share = epsilon / actions_count
    for _ in range(max_episodes):
        start = reset_func()
        visited, taken, _, rewards = step_until_the_end_of_the_episode_and_generate_trajectory(
            start, pi, step_func, is_terminal_func, max_steps_per_episode)
        pairs = list(zip(visited, taken))

        discounted_return = 0.0
        # Walk the episode backwards so the return accumulates incrementally.
        for t in range(len(visited) - 1, -1, -1):
            discounted_return = gamma * discounted_return + rewards[t]
            st, at = pairs[t]
            # First-visit rule: only the earliest occurrence of a pair counts.
            if (st, at) in pairs[:t]:
                continue

            return_sums[st, at] += discounted_return
            visit_counts[st, at] += 1
            Q[st, at] = return_sums[st, at] / visit_counts[st, at]
            # Epsilon-greedy improvement around the current greedy action.
            greedy = np.argmax(Q[st, :])
            pi[st, :] = explore_share
            pi[st, greedy] = 1.0 - epsilon + epsilon / actions_count
    return Q, pi

#### Test de l'algorithme On-Policy First-Visit Monte Carlo

In [239]:
# Run on-policy first-visit Monte Carlo control on the Line World, then print
# the action values, the policy table and the elapsed wall-clock time in seconds.
t1 = time.time()
Q, Pi = on_policy_first_visit_monte_carlo(len(S), len(A),
                                            reset,
                                            step,
                                            is_terminal,
                                            max_episodes=10000, max_steps_per_episode=100)
print(Q)
print(Pi)
print(f"le time d'execution : {time.time() - t1}")

[[ 0.          0.        ]
 [-1.          0.97397235]
 [ 0.85655042  0.98861531]
 [ 0.97387833  1.        ]
 [ 0.          0.        ]]
[[0.   0.  ]
 [0.05 0.95]
 [0.05 0.95]
 [0.05 0.95]
 [0.   0.  ]]
le time d'execution : 1.1858654022216797


#### Implémentation de l'algorithme Off-Policy Monte Carlo Control

In [240]:
def off_policy_monte_carlo_control(
        states_count: int,
        actions_count: int,
        reset_func: Callable,
        step_func: Callable,
        is_terminal_func: Callable,
        max_episodes: int = 1000,
        max_steps_per_episode: int = 10,
        gamma: float = 0.99,
        epsilon: float = 0.1,
        epsilon_greedy_behaviour_policy: bool = False
) -> (np.ndarray, np.ndarray):
    """Off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated by a behaviour policy (uniform, or epsilon-greedy
    with respect to the current ``Q`` when ``epsilon_greedy_behaviour_policy``
    is set) while a greedy target policy is learned.

    Args:
        states_count: number of states (states are ``0 .. states_count - 1``).
        actions_count: number of actions.
        reset_func: called without arguments; returns the start state.
        step_func: called as ``step_func(state, action)``; must return
            ``(next_state, reward)``.
        is_terminal_func: called as ``is_terminal_func(state)``.
        max_episodes: number of episodes to simulate.
        max_steps_per_episode: cap on the length of an episode.
        gamma: discount factor.
        epsilon: exploration mass of the epsilon-greedy behaviour policy.
        epsilon_greedy_behaviour_policy: refresh the behaviour policy from
            ``Q`` before every episode instead of keeping it uniform.

    Returns:
        ``(Q, pi)``: action-value estimates and the greedy target policy as a
        one-hot table. Every row of ``pi`` is one-hot, terminal states included.
    """
    behaviour = tabular_random_uniform_policy(states_count, actions_count)
    target = tabular_random_uniform_policy(states_count, actions_count)
    all_states = np.arange(states_count)

    q_values = np.random.random((states_count, actions_count))

    for state in all_states:
        if is_terminal_func(state):
            q_values[state, :] = 0.0
        # The target policy starts greedy with respect to the initial estimates.
        target[state, :] = 0
        target[state, np.argmax(q_values[state, :])] = 1.0

    cumulative_weights = np.zeros((states_count, actions_count))

    for _ in range(max_episodes):
        if epsilon_greedy_behaviour_policy:
            for state in all_states:
                behaviour[state, :] = epsilon / actions_count
                behaviour[state, np.argmax(q_values[state, :])] = 1.0 - epsilon + epsilon / actions_count

        start = reset_func()
        visited, taken, _, rewards = step_until_the_end_of_the_episode_and_generate_trajectory(
            start, behaviour, step_func, is_terminal_func, max_steps_per_episode)

        discounted_return = 0.0
        weight = 1
        for t in range(len(visited) - 1, -1, -1):
            discounted_return = gamma * discounted_return + rewards[t]
            st, at = visited[t], taken[t]
            cumulative_weights[st, at] += weight

            q_values[st, at] += weight / cumulative_weights[st, at] * (discounted_return - q_values[st, at])
            greedy = np.argmax(q_values[st, :])
            target[st, :] = 0
            target[st, greedy] = 1.0
            # Earlier steps get zero importance weight once the behaviour
            # action differs from the target policy's greedy action.
            if greedy != at:
                break
            weight = weight / behaviour[st, at]
    return q_values, target

#### Test de l'algorithme Off-Policy Monte Carlo Control

In [241]:
# Run off-policy Monte Carlo control on the Line World with the default
# (uniform) behaviour policy, then print the action values, the target policy
# table and the elapsed wall-clock time in seconds.
t1 = time.time()
Q, Pi = off_policy_monte_carlo_control(len(S), len(A),
                                           reset,
                                           step,
                                           is_terminal,
                                           max_episodes=10000, max_steps_per_episode=100)
print(Q)
print(Pi)
print(f"le time d'execution : {time.time() - t1}")

[[ 0.        0.      ]
 [-1.        0.9801  ]
 [ 0.970299  0.99    ]
 [ 0.9801    1.      ]
 [ 0.        0.      ]]
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
le time d'execution : 2.143306016921997
