In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import gym

# Integer action codes used as policy values and passed to `env.step`.
LEFT, DOWN, RIGHT, UP = range(4)

# Each policy maps a state index to an action code. The entries are laid out
# below as rows of a 4-column grid (states 0-3, 4-7, 8-11). States 8 and 11
# have no entry in any policy.

# Policy 1: RIGHT everywhere, except DOWN in states 3 and 7.
policy_1 = {
    0: RIGHT, 1: RIGHT, 2: RIGHT, 3: DOWN,
    4: RIGHT, 5: RIGHT, 6: RIGHT, 7: DOWN,
    9: RIGHT, 10: RIGHT,
}

# Policy 2: DOWN everywhere, except RIGHT in states 4, 9 and 10.
policy_2 = {
    0: DOWN, 1: DOWN, 2: DOWN, 3: DOWN,
    4: RIGHT, 5: DOWN, 6: DOWN, 7: DOWN,
    9: RIGHT, 10: RIGHT,
}

# Policy 3: same first row as policy 1; second row is RIGHT, DOWN, LEFT, LEFT.
policy_3 = {
    0: RIGHT, 1: RIGHT, 2: RIGHT, 3: DOWN,
    4: RIGHT, 5: DOWN, 6: LEFT, 7: LEFT,
    9: RIGHT, 10: RIGHT,
}

# Policy 4: LEFT everywhere, except DOWN in states 0 and 4.
policy_4 = {
    0: DOWN, 1: LEFT, 2: LEFT, 3: LEFT,
    4: DOWN, 5: LEFT, 6: LEFT, 7: LEFT,
    9: LEFT, 10: LEFT,
}

class RewardWrapper(gym.RewardWrapper):
    """Reward wrapper that turns a zero reward into a negative step penalty."""

    def __init__(self, env, step_penalty=0.01):
        """Wrap `env`; `step_penalty` is the amount subtracted when the reward is zero."""
        super().__init__(env)
        self.step_penalty = step_penalty

    def reward(self, rew):
        """Return `-step_penalty` when `rew` equals 0, otherwise `rew` unchanged."""
        return -self.step_penalty if rew == 0 else rew
    
class ResetWrapper(gym.Wrapper):
    """Wrapper whose `reset` can place the agent on an arbitrary start state.

    No custom `__init__` is needed: `gym.Wrapper`'s constructor already takes
    the environment to wrap.
    """

    def reset(self, start_pos=0):
        """Reset the wrapped environment, then force its state to `start_pos`.

        Args:
            start_pos: State index the episode should start from.

        Returns:
            `start_pos`, which serves as the initial observation.
        """
        super().reset()
        # Write the state on the innermost environment. `unwrapped` reaches it
        # no matter how many wrappers sit in between, whereas a fixed
        # `.env.env.env` chain only works for one exact wrapper stack.
        # Assumes the base env keeps its current state in the attribute `s`.
        self.unwrapped.s = start_pos
        return start_pos

def get_frozenlake_env(is_slippery, step_penalty=0.01, custom_map = ['SFFF', 'FFFF', 'HFFG']):
    """Build a FrozenLake-v0 environment wrapped with the two helpers above.

    Args:
        is_slippery: Forwarded to `gym.make` as `is_slippery`.
        step_penalty: Forwarded to `RewardWrapper`.
        custom_map: Map description forwarded to `gym.make` as `desc`
            (default: three rows of four tiles).

    Returns:
        The environment wrapped first in `RewardWrapper`, then in `ResetWrapper`.
    """
    base_env = gym.make("FrozenLake-v0", desc=custom_map, is_slippery=is_slippery)
    return ResetWrapper(RewardWrapper(base_env, step_penalty=step_penalty))

In [4]:
#from frozenlake_helper import get_frozenlake_env, policy_1, policy_2, policy_3, policy_4, LEFT, RIGHT, UP, DOWN
import numpy as np

In [5]:
# All candidate policies, in order; iterated further down to compare their value estimates.
policies = [policy_1, policy_2, policy_3, policy_4]

In [6]:
# Number of states (the default map has 3 rows x 4 columns = 12 tiles).
# Read as a global by `estimate_V_sampling` and by the model-building loop.
number_of_states = 12

In [8]:
def get_discounted_return(r, gamma=0.9):
    """Return the discounted return of a reward sequence, measured from its first step.

    Computes ``r[0] + gamma * r[1] + gamma**2 * r[2] + ...`` by accumulating
    backwards from the last reward.

    Args:
        r: 1D sequence (list or array) of per-step rewards.
        gamma: Discount factor applied once per step.

    Returns:
        The discounted return as a float; ``0.0`` when `r` is empty.
    """
    # Accept plain lists as well as arrays.
    rewards = np.array(r, dtype=float)
    running_add = 0.0
    # Walk from the last reward to the first: each step discounts everything
    # accumulated so far and adds the current reward. Only the final value
    # (the return from step 0) is needed, so no per-step array is kept.
    for reward in rewards[::-1]:
        running_add = running_add * gamma + reward
    return float(running_add)

def run_episode(env, policy, start_pos, gamma=1.0):
    """Play one episode from `start_pos` following `policy`.

    Args:
        env: Environment whose `reset(start_pos)` returns the initial
            observation and whose `step(action)` returns a 4-tuple
            `(obs, reward, done, info)`.
        policy: Mapping from observation to action.
        start_pos: State passed to `env.reset`.
        gamma: Discount factor forwarded to `get_discounted_return`.

    Returns:
        The discounted return of the collected rewards.
    """
    state = env.reset(start_pos)
    episode_rewards = []
    # At least one step is always taken; the loop ends once the env reports done.
    while True:
        state, step_reward, finished, _info = env.step(policy[state])
        episode_rewards.append(step_reward)
        if finished:
            break
    return get_discounted_return(episode_rewards, gamma)

def get_expected_return(env, policy, N=5000, start_pos=0, gamma=1.0):
    """Run `N` episodes from `start_pos` and summarize their discounted returns.

    Args:
        env: Environment forwarded to `run_episode`.
        policy: Policy forwarded to `run_episode`.
        N: Number of episodes to run.
        start_pos: Start state of every episode.
        gamma: Discount factor of every episode.

    Returns:
        Tuple `(returns, mean, std)`: the list of per-episode returns, their
        mean and their standard deviation.
    """
    episode_returns = [
        run_episode(env, policy, start_pos=start_pos, gamma=gamma)
        for _ in range(N)
    ]
    return episode_returns, np.mean(episode_returns), np.std(episode_returns)

# Ejercicio 1: Muestreo en entorno aleatorio

In [9]:
# Experiment setup; these names are notebook globals read by the cells below.
step_penalty = 0  # zero rewards stay zero (no per-step penalty)
gamma = 1.0  # undiscounted returns
is_slippery = True  # forwarded to gym.make through get_frozenlake_env
env = get_frozenlake_env(is_slippery, step_penalty=step_penalty)
policy = policy_1

### Armar una función que devuelva la estimación de la V(s)
Recibe:
- El entorno (env)
- La política (policy)
- La cantidad de episodios usados para la estimación

Devuelve:
- numpy array de longitud 12 con los "value" donde la posición indica el estado 

In [11]:
def estimate_V_sampling(env, policy, N=10_000, gamma=1.0):
    """Estimate V(s) for every state by averaging sampled episode returns.

    Args:
        env: Environment forwarded to `get_expected_return`.
        policy: Mapping from state to action; states missing from it are
            skipped and keep a value of 0.
        N: Number of episodes sampled per start state.
        gamma: Discount factor.

    Returns:
        Array of length `number_of_states` (module-level global) where index
        `s` holds the mean return of the episodes started in state `s`.
    """
    values = np.zeros(number_of_states)
    for state in (s for s in range(number_of_states) if s in policy):
        # get_expected_return yields (returns, mean, std); only the mean is used.
        values[state] = get_expected_return(
            env, policy=policy, N=N, start_pos=state, gamma=gamma
        )[1]
    return values

In [12]:
N = 1_000  # episodes sampled per start state
# Loop-local names are used on purpose: iterating with `policy` would leave the
# notebook-level `policy` pointing at the last policy once the loop finishes.
for policy_number, candidate_policy in enumerate(policies, start=1):
    Vs_sample = estimate_V_sampling(env, candidate_policy, N=N, gamma=gamma)
    print('V(s) para policy', policy_number)
    # Show the 12 values laid out as 3 rows of 4 columns.
    print(Vs_sample.reshape(3, 4))
    print()

V(s) para policy 1
[[0.819 1.    1.    1.   ]
 [0.595 1.    1.    1.   ]
 [0.    1.    1.    0.   ]]

V(s) para policy 2
[[0.535 0.684 0.786 0.864]
 [0.405 0.701 0.829 0.921]
 [0.    0.79  0.893 0.   ]]

V(s) para policy 3
[[0.594 0.751 0.819 0.866]
 [0.421 0.666 0.816 0.887]
 [0.    0.774 0.907 0.   ]]

V(s) para policy 4
[[0.    0.    0.    0.196]
 [0.    0.    0.    0.403]
 [0.    0.    0.    0.   ]]



# Ejercicio 2: Armar modelos de entorno y recompensa

### Para el caso del entorno esto sería un diccionario: 
- con keys de todos los estados posibles (de 0 a 11)
- para cada estado un diccionario con keys de las acciones posibles (LEFT, RIGHT, UP, DOWN)
- para cada estado y acción un diccionario que indique el listado de los próximos estados con sus probabilidades

### Para el caso del modelo de recompensa sería: 

Igual al anterior, solo que el último diccionario contiene los próximos estados con la recompensa de cada uno.

En este caso para simplificar el código se guarda:
- count: cantidad de veces que se entró en ese estado (se usa para promediar el reward)
- total_reward: la suma de los rewards
- reward: El que nos interesa (total_reward/count)

In [13]:
# Label for each action code; these labels are the action keys of the models
# built in the next cell.
action_to_str = {
    LEFT: 'LEFT',
    RIGHT: 'RIGHT',
    UP: 'UP',
    DOWN: 'DOWN',
}
# Inverse lookup: label -> action code.
str_to_action = {label: code for code, label in action_to_str.items()}

In [16]:
actions = [LEFT, RIGHT, UP, DOWN]

N = 500  # one-step samples drawn per (state, action) pair
transition_model = {}
reward_model = {}

for start_pos in range(number_of_states):
    # Both models are nested as state -> action label -> next state.
    transition_model[start_pos] = {}
    reward_model[start_pos] = {}
    for action in actions:
        action_str = action_to_str[action]
        next_state_counts = transition_model[start_pos][action_str] = {}
        next_state_rewards = reward_model[start_pos][action_str] = {}
        for n in range(N):
            # Only a single transition is sampled, not a whole episode:
            # place the agent on start_pos and take one step.
            env.reset(start_pos)
            obs, reward, done, info = env.step(action)

            if obs not in next_state_counts:
                # First time this next state is observed for (state, action).
                next_state_counts[obs] = 0
                next_state_rewards[obs] = {'total_reward': 0, 'count': 0, 'reward': 0}

            next_state_counts[obs] += 1
            stats = next_state_rewards[obs]
            stats['total_reward'] += reward
            stats['count'] += 1
            # Running average reward for landing on this next state.
            stats['reward'] = stats['total_reward'] / stats['count']

# Normalize the visit counts into probabilities. Distinct loop names are used
# so the `actions` list defined at the top of the cell is not rebound.
for state_transitions in transition_model.values():
    for next_state_counts in state_transitions.values():
        total_count = sum(next_state_counts.values())
        # Only values are replaced (no keys added or removed), so iterating
        # the dict while assigning is safe.
        for next_state in next_state_counts:
            next_state_counts[next_state] /= total_count

### ¿Depende el modelo del entorno de la policy?

In [17]:
# Estimated next-state probabilities for each action taken from state 0.
transition_model[0]

{'DOWN': {0: 0.364, 1: 0.324, 4: 0.312},
 'LEFT': {0: 0.688, 4: 0.312},
 'RIGHT': {0: 0.308, 1: 0.32, 4: 0.372},
 'UP': {0: 0.592, 1: 0.408}}

In [18]:
# Per-next-state reward statistics (count, average, total) for each action taken from state 10.
reward_model[10]

{'DOWN': {9: {'count': 194, 'reward': 0.0, 'total_reward': 0},
  10: {'count': 168, 'reward': 0.0, 'total_reward': 0},
  11: {'count': 138, 'reward': 1.0, 'total_reward': 138.0}},
 'LEFT': {6: {'count': 161, 'reward': 0.0, 'total_reward': 0},
  9: {'count': 158, 'reward': 0.0, 'total_reward': 0},
  10: {'count': 181, 'reward': 0.0, 'total_reward': 0}},
 'RIGHT': {6: {'count': 158, 'reward': 0.0, 'total_reward': 0},
  10: {'count': 163, 'reward': 0.0, 'total_reward': 0},
  11: {'count': 179, 'reward': 1.0, 'total_reward': 179.0}},
 'UP': {6: {'count': 178, 'reward': 0.0, 'total_reward': 0},
  9: {'count': 169, 'reward': 0.0, 'total_reward': 0},
  11: {'count': 153, 'reward': 1.0, 'total_reward': 153.0}}}

In [19]:
# Per-next-state reward statistics (count, average, total) for each action taken from state 7.
reward_model[7]

{'DOWN': {6: {'count': 194, 'reward': 0.0, 'total_reward': 0},
  7: {'count': 154, 'reward': 0.0, 'total_reward': 0},
  11: {'count': 152, 'reward': 1.0, 'total_reward': 152.0}},
 'LEFT': {3: {'count': 174, 'reward': 0.0, 'total_reward': 0},
  6: {'count': 164, 'reward': 0.0, 'total_reward': 0},
  11: {'count': 162, 'reward': 1.0, 'total_reward': 162.0}},
 'RIGHT': {3: {'count': 176, 'reward': 0.0, 'total_reward': 0},
  7: {'count': 163, 'reward': 0.0, 'total_reward': 0},
  11: {'count': 161, 'reward': 1.0, 'total_reward': 161.0}},
 'UP': {3: {'count': 155, 'reward': 0.0, 'total_reward': 0},
  6: {'count': 162, 'reward': 0.0, 'total_reward': 0},
  7: {'count': 183, 'reward': 0.0, 'total_reward': 0}}}

# Ejercicio 3: Value iteration

In [20]:
def estimate_V_by_value_iteration(policy, transition_model, reward_model, N=500, number_of_states=12, gamma=1.0):
    """Estimate V(s) for a fixed policy by repeated expected-value backups on a model.

    Each sweep sets, for every state `s` covered by the policy,
    ``V[s] = sum over s' of P(s' | s, a) * (R(s, a, s') + gamma * V[s'])``
    with ``a = policy[s]``. Values are updated in place, so states later in a
    sweep already see the values refreshed earlier in that same sweep.

    Args:
        policy: Mapping from state index to action code. States missing from
            it are never updated and keep a value of 0.
        transition_model: ``transition_model[s][action_label][next_s]`` is the
            probability of reaching `next_s`.
        reward_model: ``reward_model[s][action_label][next_s]['reward']`` is
            the reward associated with that transition.
        N: Number of sweeps over all states.
        number_of_states: Length of the returned value array.
        gamma: Discount factor applied to the next state's value. The default
            of 1.0 reproduces the undiscounted update.

    Returns:
        Array of length `number_of_states` with the estimated values.
    """
    Vs = np.zeros(number_of_states)
    for _ in range(N):
        for s in range(number_of_states):
            if s not in policy:
                continue
            # The models are keyed by action label, the policy by action code.
            action = action_to_str[policy[s]]
            rewards = reward_model[s][action]
            expected_value = 0
            for next_s, prob in transition_model[s][action].items():
                expected_value = expected_value + (rewards[next_s]['reward'] + gamma * Vs[next_s]) * prob
            Vs[s] = expected_value
    return Vs

In [21]:
# Evaluate policy_1 on the sampled models using 10 sweeps.
# NOTE(review): 10 sweeps may be too few for the values to converge; the result
# shown below differs noticeably from the sampling estimate of the same policy.
# Confirm whether a larger N was intended.
N = 10
policy = policy_1
Vs = estimate_V_by_value_iteration(policy, transition_model, reward_model, N=N)

In [22]:
# Show the 12 model-based values as 3 rows of 4 columns.
Vs.reshape(3, 4)

array([[0.40554616, 0.65460558, 0.74970397, 0.78866147],
       [0.35327082, 0.73563181, 0.84491125, 0.90385623],
       [0.        , 0.79002096, 0.91885642, 0.        ]])

In [23]:
# Sampling-based estimate for the same policy, to compare with the model-based values.
N = 200  # episodes sampled per start state
Vs_sample_policy = estimate_V_sampling(env, policy, N=N, gamma=gamma)
Vs_sample_policy.reshape(3, 4)

array([[0.79, 1.  , 1.  , 1.  ],
       [0.63, 1.  , 1.  , 1.  ],
       [0.  , 1.  , 1.  , 0.  ]])