In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import gym

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

policy_1 = {
    0: RIGHT,
    1: RIGHT, 
    2: RIGHT,
    3: DOWN, 
    4: RIGHT,
    5: RIGHT,
    6: RIGHT,
    7: DOWN,
    9: RIGHT,
    10: RIGHT,
}

policy_2 = {
    0: DOWN,
    1: DOWN, 
    2: DOWN,
    3: DOWN, 
    4: RIGHT,
    5: DOWN,
    6: DOWN,
    7: DOWN,
    9: RIGHT,
    10: RIGHT,
}

policy_3 = {
    0: RIGHT,
    1: RIGHT, 
    2: RIGHT,
    3: DOWN, 
    4: RIGHT,
    5: DOWN,
    6: LEFT,
    7: LEFT,
    9: RIGHT,
    10: RIGHT,
}

policy_4 = {
    0: DOWN,
    1: LEFT, 
    2: LEFT,
    3: LEFT, 
    4: DOWN,
    5: LEFT,
    6: LEFT,
    7: LEFT,
    9: LEFT,
    10: LEFT,
}

class RewardWrapper(gym.RewardWrapper):
    def __init__(self, env, step_penalty=0.01):
        self.step_penalty = step_penalty
        super().__init__(env)
    
    def reward(self, rew):
        # modify rew
        if rew == 0:
            rew = -self.step_penalty
        return rew
    
class ResetWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def reset(self, start_pos=0):
        super().reset()
        self.env.env.env.s = start_pos
        return start_pos

def get_frozenlake_env(is_slippery, step_penalty=0.01, custom_map = ['SFFF', 'FFFF', 'HFFG']):
    env = gym.make("FrozenLake-v0", desc=custom_map, is_slippery=is_slippery)
    env = RewardWrapper(env, step_penalty=step_penalty)
    env = ResetWrapper(env)
    return env

In [4]:
#from frozenlake_helper import get_frozenlake_env, policy_1, policy_2, policy_3, policy_4
import numpy as np

In [5]:
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

In [6]:
policies = [policy_1, policy_2, policy_3, policy_4]

# Instancio entorno

In [7]:
env = get_frozenlake_env(is_slippery=False, step_penalty=0)

### Algunas pruebas para familiarizarse con los entornos de openAI

In [8]:
start_position = 1
env.reset(start_position)
env.render()

# Genéro acción y recibo siguiente estado, recompenza y si terminó el episodio
obs, reward, done, info = env.step(DOWN)
print()
print(obs, reward, done, info)
print()
env.render()


S[41mF[0mFF
FFFF
HFFG

5 0 False {'prob': 1.0}

  (Down)
SFFF
F[41mF[0mFF
HFFG


In [9]:
env.step(RIGHT)
print(obs, reward, done, info)
env.render()

5 0 False {'prob': 1.0}
  (Right)
SFFF
FF[41mF[0mF
HFFG


# Armar una función que corra un episodio completo y devuelta el retorno acumulado

Recibe el entorno, la política y la posición inicial del robot
Devuelve el retorno (suma de los rewards)

In [10]:
def get_discounted_return(r, gamma=0.9):
    # Por si es una lista
    r = np.array(r, dtype=float)
    """Take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r[0]

def run_episode(env, policy, start_pos, gamma=1.0):
    obs = env.reset(start_pos)
    done = False
    total_return = []
    rewards = []
    while not done:
        action = policy[obs]
        obs, reward, done, info = env.step(action)
        rewards.append(reward)
    return get_discounted_return(rewards, gamma)

# Ejercicio 1:
- Entorno determinístico
- Penalidad del paso = 0 
- Sin discount
- Partiendo de posición 0: (0,0)

In [11]:
start_pos = 0
step_penalty = 0

In [12]:
env = get_frozenlake_env(False, step_penalty=step_penalty)

### Evaluar las distintas políticas pariendo desde la posición 0

In [13]:
for i, policy in enumerate(policies):
    r = run_episode(env, policy=policy, start_pos=start_pos)
    print(f'Return policy_{i+1}:', r)

Return policy_1: 1.0
Return policy_2: 1.0
Return policy_3: 1.0
Return policy_4: 0.0


# Ejercicio 2:
- Entorno determinístico
- Penalidad del paso = 0.01 
- Sin discount
- Partiendo de posición 0: (0,0)

In [14]:
start_pos = 0
step_penalty = 0.01
env = get_frozenlake_env(False, step_penalty=step_penalty)

In [15]:
for i, policy in enumerate(policies):
    r = run_episode(env, policy=policy, start_pos=start_pos)
    print(f'Return policy_{i+1}:', r)

Return policy_1: 0.96
Return policy_2: 0.96
Return policy_3: 0.9199999999999999
Return policy_4: -0.02


# Ejercicio 3:
- Entorno determinístico
- Penalidad del paso = 0 
- gamma = 0.9
- Partiendo de posición 0: (0,0)

### En este entorne debe crear la función get_discounted_return y modificar run_episode acorde

In [16]:
def get_discounted_return(r, gamma=0.9):
    # Por si es una lista
    r = np.array(r, dtype=float)
    """Take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r[0]

In [17]:
def run_episode(env, policy, start_pos, gamma=1.0):
    obs = env.reset(start_pos)
    done = False
    rewards = []
    while not done:
        action = policy[obs]
        obs, reward, done, info = env.step(action)
        rewards.append(reward)
    return get_discounted_return(rewards, gamma)

In [18]:
start_pos = 0
step_penalty = 0
gamma = 0.9
env = get_frozenlake_env(False, step_penalty=step_penalty)

In [19]:
for i, policy in enumerate(policies):
    r = run_episode(env, policy=policy, start_pos=start_pos, gamma=gamma)
    print(f'Return policy_{i+1}:', r)

Return policy_1: 0.6561000000000001
Return policy_2: 0.6561000000000001
Return policy_3: 0.43046721000000016
Return policy_4: 0.0


# Ejercicio 4:
- Entorno aleatorio
- Penalidad del paso = 0 
- gamma = 1.0 (sin discount)
- Partiendo de posición 0: (0,0)

In [22]:
start_pos = 0
step_penalty = 0
gamma = 1.0
env = get_frozenlake_env(True, step_penalty=step_penalty)

### En este caso cuando se decide una acción, el agente se moverá hacia el lugar indicado con una probabilidad de 0.33, y se moverá hacia cualquiera de los costados con probabilidad 0.33

### Cada vez que corra un episodio obtendrá un resultado diferente, intentelo

In [26]:
# Correr varias veces y ver que el resultado cambia con cada iteración
for i, policy in enumerate(policies):
    r = run_episode(env, policy=policy, start_pos=start_pos, gamma=gamma)
    print(f'Return policy_{i+1}:', r)

Return policy_1: 1.0
Return policy_2: 0.0
Return policy_3: 1.0
Return policy_4: 0.0


### Armar una función que corra el episodio N veces y devuleva los retornos, la media y desvio

In [27]:
def get_expected_return(env, policy, N=5000, start_pos=0, gamma=1.0):
    rewards = []
    
    for i in range(N):
        reward = run_episode(env, policy, start_pos=start_pos, gamma=gamma)
        rewards.append(reward)

    return rewards, np.mean(rewards), np.std(rewards)

In [28]:
for i, policy in enumerate(policies):
    r = get_expected_return(env, policy, N=10_000, start_pos=start_pos, gamma=gamma)
    print(f'Return policy_{i+1}:', r[1], r[2])

Return policy_1: 0.8041 0.39689191223808024
Return policy_2: 0.5436 0.4980954125466324
Return policy_3: 0.5797 0.49360704006324707
Return policy_4: 0.0 0.0


In [29]:
for i, policy in enumerate(policies):
    r = get_expected_return(env, policy, N=10_000, start_pos=start_pos, gamma=gamma)
    print(f'Return policy_{i+1}:', r[1], r[2])

Return policy_1: 0.8018 0.39864365039468524
Return policy_2: 0.5326 0.49893610813409767
Return policy_3: 0.5704 0.49501902993723385
Return policy_4: 0.0 0.0


# Ejercicio 5:
- Entorno determinístico
- Penalidad del paso = 0 
- gamma = 0.9 (sin discount)
- Partiendo de posición 0: (0,0)

### Calcular la value-function de todas los politicas

In [30]:
step_penalty = 0
gamma = 0.9
env = get_frozenlake_env(False, step_penalty=step_penalty)

In [31]:
for i, policy in enumerate(policies):
    print(f'Return policy_{i+1}:')
    for start_pos in range(12):
        if start_pos in policy:
            r = run_episode(env, policy=policy, start_pos=start_pos, gamma=gamma)
            print(start_pos, r)
    print()

Return policy_1:
0 0.6561000000000001
1 0.7290000000000001
2 0.81
3 0.9
4 0.7290000000000001
5 0.81
6 0.9
7 1.0
9 0.9
10 1.0

Return policy_2:
0 0.6561000000000001
1 0.7290000000000001
2 0.81
3 0.9
4 0.7290000000000001
5 0.81
6 0.9
7 1.0
9 0.9
10 1.0

Return policy_3:
0 0.43046721000000016
1 0.47829690000000014
2 0.5314410000000002
3 0.5904900000000002
4 0.7290000000000001
5 0.81
6 0.7290000000000001
7 0.6561000000000001
9 0.9
10 1.0

Return policy_4:
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
9 0.0
10 0.0

