<a href="https://colab.research.google.com/github/Naegi-UHA/DeepLearning_TD1/blob/main/DeepLearning_TD1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



> Import libraries to use



In [116]:
import numpy as np

>  # Introduction to NumPy (skip if you are already familiar with it)

>> Creating a 1D array

In [117]:
# Build a 1-D NumPy array from a flat Python list and display it.
a = np.array([1,2,3,4])
print(a)

[1 2 3 4]


>> Creating a 2D array


In [118]:
# A list of two equal-length lists becomes a 2-D array (one inner list per row).
a = np.array([[1,2],[3,4]])
print(a)

[[1 2]
 [3 4]]


>> Creating an array full of zeros


In [119]:
# `(10)` is just the integer 10 (not a tuple); an integer shape gives a 1-D array.
a = np.zeros(shape=(10))
print(a)
# A tuple shape gives a multi-dimensional array: here 5 rows by 2 columns.
a = np.zeros(shape=(5,2))
print(a)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


>> Infinity in numpy

In [120]:
# np.inf is NumPy's floating-point positive infinity constant.
print(np.inf)

inf


>> Max and Argmax

In [121]:
a = np.array([2,1,4,3])
# np.max returns the largest value; np.argmax returns the index where it occurs.
print(np.max(a))
print(np.argmax(a))

4
2


>> From list to Numpy

In [122]:
# Convert a plain Python list to a NumPy array; printing both shows the
# difference in representation (commas for the list, none for the array).
l = [1,2,3,4]
print(l)
print(np.asarray(l))

[1, 2, 3, 4]
[1 2 3 4]


>> Random in numpy

In [123]:
# Array of random integers from 1 to 9 (`high` is exclusive, so 10 is never
# drawn), with any size you want
a = np.random.randint(low=1, high=10, size=(5,2))
print(a)

# Array of random elements drawn from a list, with any size you want
# (this second array is assigned but not printed in this cell)
a = np.random.choice([0,1,2], size=(2,))

[[8 5]
 [9 8]
 [2 5]
 [1 9]
 [1 8]]


>> Shapes in numpy

In [124]:
# Random 4x2 integer array with values in 1..4 (`high` is exclusive).
a = np.random.randint(low=1, high=5, size=(4,2))
print(a.shape)
print(a)

# Reshape a to a vector of shape = (8,1); the 8 elements keep their
# row-by-row order, as the printed output shows.
a = a.reshape((8,1))
print(a.shape)
print(a)

(4, 2)
[[1 2]
 [1 4]
 [4 1]
 [4 4]]
(8, 1)
[[1]
 [2]
 [1]
 [4]
 [4]
 [1]
 [4]
 [4]]


# Pre-defined utilities

In [125]:

# Action index -> one-letter label used when a policy is printed
# (presumably up / right / down / left, matching the offsets below).
int_to_char = {0: 'u', 1: 'r', 2: 'd', 3: 'l'}

# Action index -> [row offset, column offset] added to the current (i, j)
# position when that action is taken.
policy_one_step_look_ahead = {
    0: [-1, 0],
    1: [0, 1],
    2: [1, 0],
    3: [0, -1],
}

def policy_int_to_char(pi,n):
    """
    Convert an integer policy grid into a grid of one-letter action labels.

    Parameters
    ----------
    pi : 2-D array indexed as pi[i, j]
        Action index of every state; each non-terminal entry must be a key
        of `int_to_char`.
    n : int
        Side length of the square grid.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        The label of each state's action. The two terminal states, (0, 0)
        and (n-1, n-1), hold the empty string because no action is looked
        up for them.
    """
    terminal_states = {(0, 0), (n - 1, n - 1)}

    # Emit exactly one entry per cell. Seeding the list with '' and appending
    # another '' at the end (while skipping terminals) produces n*n entries
    # only when the two terminals are distinct cells, which breaks for n == 1.
    pi_char = [
        '' if (i, j) in terminal_states else int_to_char[pi[i, j]]
        for i in range(n)
        for j in range(n)
    ]

    return np.asarray(pi_char).reshape(n, n)

def next_state(i, j, action, n):
    """
    Return the state reached from (i, j) when `action` is taken.

    The action's [row, column] offset is read from
    `policy_one_step_look_ahead`; each coordinate is then clamped to the
    range [0, n-1], so a move that would leave the grid keeps the agent on
    the border.
    """
    row_step, col_step = policy_one_step_look_ahead[action]

    row = i + row_step
    if row < 0:
        row = 0
    if row > n - 1:
        row = n - 1

    col = j + col_step
    if col < 0:
        col = 0
    if col > n - 1:
        col = n - 1

    return row, col

# 1- Policy evaluation

In [126]:
def policy_evaluation(n,pi,v,Gamma,threshhold):
    """
    Iteratively compute the value function of the deterministic policy `pi`.

    Every sweep rebuilds the whole table from the previous one:
    v_new[i, j] = reward + Gamma * v[next state under pi[i, j]].
    The reward is 5 when the state being left is in `special_states`
    and -1 otherwise. The terminal states (0, 0) and (n-1, n-1) are pinned
    to 0.

    Parameters
    ----------
    n : int
        Side length of the square grid.
    pi : 2-D array indexed as pi[i, j]
        Action index of every non-terminal state.
    v : 2-D array indexed as v[i, j]
        Starting value estimate. It is not modified.
    Gamma : float
        Discount factor.
    threshhold : float
        Sweeps stop once the largest absolute change over the non-terminal
        states is <= this value.

    Returns
    -------
    numpy.ndarray of floats, same shape as `v`
        The value estimate after the last sweep.

    Notes
    -----
    The loop has no iteration cap: it only returns when the stopping
    criterion is met, so it may not terminate if the values never settle.
    """
    # States that grant a bonus instead of the usual step cost
    special_states = {(2, 2)}
    terminal_states = {(0, 0), (n - 1, n - 1)}

    # Work on a float copy: with an integer-typed `v`, every discounted
    # backup would be truncated on assignment and `delta` would be wrong.
    v = np.array(v, dtype=float)

    while True:
        delta = 0
        v_new = v.copy()

        for i in range(n):
            for j in range(n):
                # Terminal state: value fixed at 0, excluded from delta
                if (i, j) in terminal_states:
                    v_new[i, j] = 0
                    continue

                ni, nj = next_state(i, j, pi[i, j], n)

                # Special reward or regular step cost
                reward = 5 if (i, j) in special_states else -1

                # Back up from the previous sweep's table (synchronous update)
                v_new[i, j] = reward + Gamma * v[ni, nj]

                delta = max(delta, abs(v_new[i, j] - v[i, j]))

        v = v_new

        if delta <= threshhold:
            return v

# 2- Policy improvement

In [127]:
def policy_improvement(n,pi,v,Gamma):
    """
    Build the greedy policy with respect to the value table `v`.

    For each non-terminal state, the four actions 0..3 are scored as
    -1 + Gamma * v[state reached by the action], and the first action with
    the highest score is selected. The terminal states (0, 0) and
    (n-1, n-1) keep whatever entry `pi` already holds.

    Returns
    -------
    (new_pi, policy_stable)
        `new_pi` is a modified copy of `pi` (the input is left untouched).
        `policy_stable` is True when no state's action changed, else False.
    """
    terminal_states = {(0, 0), (n - 1, n - 1)}

    new_pi = pi.copy()
    policy_stable = True

    # Visit every cell in row-major order
    for i, j in np.ndindex(n, n):
        # Terminal states: nothing to improve
        if (i, j) in terminal_states:
            continue

        # One-step look-ahead score of each candidate action
        action_values = [
            -1 + Gamma * v[next_state(i, j, action, n)]
            for action in (0, 1, 2, 3)
        ]

        greedy_action = np.argmax(action_values)
        new_pi[i, j] = greedy_action

        if greedy_action != pi[i, j]:
            policy_stable = False

    return new_pi, policy_stable

# 3- Policy Initialization

In [128]:
def policy_initialization(n):
    """
    Create a random starting policy for the n x n grid.

    Returns an (n, n) integer array. Every non-terminal state receives an
    action drawn with np.random.choice from [0, 1, 2, 3]; the terminal
    states (0, 0) and (n-1, n-1) hold -1, the convention for "no action".
    """
    terminal_states = {(0, 0), (n - 1, n - 1)}

    # Start from "no action" everywhere, then fill in the non-terminal cells
    pi = np.full((n, n), -1, dtype=int)

    # Row-major traversal, one random draw per non-terminal cell
    for i, j in np.ndindex(n, n):
        if (i, j) not in terminal_states:
            pi[i, j] = np.random.choice([0, 1, 2, 3])

    return pi

# 4- Policy Iteration algorithm

In [129]:
def policy_iteration(n,Gamma,threshhold):
    """
    Run policy iteration on the n x n grid.

    Starting from the policy returned by `policy_initialization` and an
    all-zero value table, alternate `policy_evaluation` and
    `policy_improvement` until the improvement step reports that the policy
    did not change. Each evaluation starts from the previous value table.

    Returns
    -------
    (pi, v)
        The final policy and the value table from the last evaluation.
    """
    pi = policy_initialization(n=n)
    v = np.zeros(shape=(n, n))

    pi_stable = False
    while not pi_stable:
        # Evaluate the current policy, then act greedily on the result
        v = policy_evaluation(n=n, v=v, pi=pi, threshhold=threshhold, Gamma=Gamma)
        pi, pi_stable = policy_improvement(n=n, pi=pi, v=v, Gamma=Gamma)

    return pi, v

# Main Code to Test

In [None]:
# Grid side length, discount factors to compare, and evaluation tolerance
n = 5
Gamma = [0, 0.8, 0.9, 0.99]
threshhold = 1e-4

# Solve the grid once per discount factor and show the policy and its values
for _gamma in Gamma:
    pi, v = policy_iteration(n=n, Gamma=_gamma, threshhold=threshhold)
    pi_char = policy_int_to_char(n=n, pi=pi)

    print()
    print("Gamma = ", _gamma)
    print()
    print(pi_char)
    print()
    print()
    print(v)

"""
    Résultat :

    La case du milieu donne un bonus de récompense au lieu d'appliquer -1
    
    Avec un gamma élevé, l'agent va volontairement enchainer les aller retours sur cette case 
    pour booster son score au lieu de rejoindre immédiatement un des points terminaux.
"""


Gamma =  0

[['' 'u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u' '']]


[[ 0. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1. -1.  5. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1.  0.]]

Gamma =  0.8

[['' 'r' 'd' 'd' 'd']
 ['r' 'r' 'd' 'd' 'd']
 ['r' 'r' 'u' 'l' 'l']
 ['u' 'u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u' '']]


[[ 0.          3.53328345  5.66662676  3.53328345  1.82662676]
 [ 3.53328345  5.66662676  8.33328345  5.66662676  3.53328345]
 [ 5.66662676  8.33328345 11.66662676  8.33328345  5.66662676]
 [ 3.53328345  5.66662676  8.33328345  5.66662676  3.53328345]
 [ 1.82662676  3.53328345  5.66662676  3.53328345  0.        ]]

Gamma =  0.9

[['' 'r' 'd' 'd' 'd']
 ['r' 'r' 'd' 'd' 'd']
 ['r' 'r' 'u' 'l' 'l']
 ['u' 'u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u' '']]


[[ 0.         13.02083943 15.57871048 13.02083943 10.71871048]
 [13.02083943 15.57871048 18.42083943 15.57871048 13.02083943]
 [15.57871048 18.42083943 21.57871048 18.42083943 1