## Práctico 5: Volcano Crossing

In [1]:
from volcano_crossing_env import VolcanoCrossing

env = VolcanoCrossing(slip_prob=0.5)

In [2]:
env.reset()

'21'

### Jugar manualmente

In [3]:
def run_game():
    """Play one episode interactively on the global ``env``.

    Resets the environment, then repeatedly reads an action from stdin and
    applies it with ``env.step`` until the environment reports the episode
    as done. The observation, reward and done flag are printed after every
    step, and the environment is rendered before the first step and after
    each step. When the episode ends, the number of steps taken and the
    accumulated (undiscounted) reward are printed.
    """
    print("Play manually...")
    obs = env.reset()
    print(obs)
    step_counter = 0
    all_rewards = 0
    env.render()

    while True:
        action = input("Next action: ")
        # NOTE(review): presumably validates the typed action; its return
        # value (if any) is ignored here -- confirm against VolcanoCrossing.
        env.check_action(action)
        obs, reward, done_env, _ = env.step(action)
        # The names inside this f-string are part of the printed text
        # ("obs=... reward=... done_env=..."), so they must not be renamed.
        print(f'{obs=} {reward=} {done_env=}')
        all_rewards += reward
        step_counter += 1
        env.render()
        if done_env:
            break

    # Previously these totals were computed and then silently discarded.
    print(f'Episode finished after {step_counter} steps, total reward: {all_rewards}')

In [14]:
run_game()

Play manually...
21
state: 21 done: False
obs=np.str_('31') reward=2 done_env=True
state: 31 done: True


### Jugar con una policy

In [17]:
def policy_south(state):
    """Constant policy: answer 'S' regardless of the given state."""
    return 'S'


def policy_north(state):
    """Constant policy: answer 'N' regardless of the given state."""
    return 'N'


def policy_east(state):
    """Constant policy: answer 'E' regardless of the given state."""
    return 'E'


def policy_west(state):
    """Constant policy: answer 'W' regardless of the given state."""
    return 'W'

In [6]:
def run(policy):
    """Run one episode on the global ``env`` and return its total reward.

    Args:
        policy: callable taking the current state and returning the action
            to pass to ``env.step``.

    Returns:
        The plain (undiscounted) sum of the rewards collected until
        ``env.step`` reports the episode as done.
    """
    total_reward = 0
    finished = False
    state = env.reset()
    while not finished:
        action = policy(state)
        state, reward, finished, _ = env.step(action)
        total_reward += reward
    return total_reward

In [20]:
run(policy_west)

2

### Estimación de la policy por promedio

In [21]:
# Monte Carlo estimate: average the total reward of policy_north over N episodes.
N = 50_000
r = sum(run(policy_north) for _ in range(N))

print(f'Average reward: {r/N}')

Average reward: -28.09456


### Policy Evaluation

In [9]:
def policy_evaluation(policy, delta = 0.05, gamma = 0.999):
    """Iteratively evaluate a fixed policy on the global ``env``.

    Each sweep recomputes, for every state in ``env.observation_space``, the
    expected one-step reward plus discounted value of the successor states
    under the action chosen by ``policy``, always reading from the value
    table of the previous sweep.

    Args:
        policy: either a callable ``state -> action`` or a mapping indexed by
            state (e.g. a dict or ``defaultdict``) holding the action.
        delta: the sweeps stop once the largest absolute change of any state
            value between two consecutive sweeps is ``<= delta``.
        gamma: discount factor applied to successor values.

    Returns:
        dict mapping each state key ('11' .. '34') to its estimated value.
    """
    # Accept mappings as well as callables so that a dict-based policy can be
    # evaluated directly.
    if callable(policy):
        choose_action = policy
    else:
        choose_action = lambda s: policy[s]

    # Value table for the 3x4 grid, keys '11', '12', ..., '34' (row-major).
    # Kept hard-coded rather than derived from env.observation_space because
    # successor states must have an entry even if they are never updated.
    V = {f'{row}{col}': 0 for row in range(1, 4) for col in range(1, 5)}
    V_new = V.copy()
    max_diff = delta + 1  # guarantees at least one sweep

    while max_diff > delta:
        for s in env.observation_space:
            a = choose_action(s)
            transitions = env.P[s][a]
            rewards = env.R[s][a]
            V_new[s] = sum(
                transitions[s_prime] * (rewards[s_prime] + gamma * V[s_prime])
                for s_prime in transitions.keys()
            )
        max_diff = max(abs(V[s] - V_new[s]) for s in V)
        V = V_new.copy()
    return V

In [10]:
V = policy_evaluation(policy_south, delta = 0.05, gamma = 0.999)
print(V)

{'11': -10.097479836569216, '12': -25.02876509699797, '13': 0.0, '14': 0.0, '21': -5.153966059429463, '22': -21.73443636193771, '23': 0.0, '24': -21.539534697673236, '31': 0.0, '32': -16.790727065526983, '33': -30.83304348111256, '34': -25.988180160178665}


In [22]:
V = policy_evaluation(policy_north, delta = 0.05, gamma = 0.999)
print(V)

{'11': -33.67385097676337, '12': -40.458592488691465, '13': 0.0, '14': 0.0, '21': -27.34202796251972, '22': -38.05642247142104, '23': 0.0, '24': 0.13372792866082328, '31': 0.0, '32': -29.944176341195977, '33': -37.83687972706362, '34': -9.342616647035367}


In [23]:
V = policy_evaluation(policy_east, delta = 0.05, gamma = 0.999)
print(V)

{'11': -43.9022649270958, '12': -47.30236602463816, '13': 0.0, '14': 0.0, '21': -33.987366121400456, '22': -42.757085499555195, '23': 0.0, '24': -17.621993521940997, '31': 0.0, '32': -25.39581557126935, '33': -28.84154928530964, '34': -23.04484298388258}


In [24]:
V = policy_evaluation(policy_west, delta = 0.05, gamma = 0.999)
print(V)

{'11': -17.897256693292853, '12': -24.525782855383536, '13': 0.0, '14': 0.0, '21': -11.63441817939945, '22': -19.239905864923973, '23': 0.0, '24': -30.173214438394798, '31': 0.0, '32': -6.210382176516225, '33': -17.893058456991003, '34': -20.924056781832736}


### Policy Improvement

In [15]:
from collections import defaultdict


# Mapping that answers 'S' for any state that has not been assigned an action yet.
# NOTE(review): presumably the starting policy for policy improvement/iteration -- confirm.
policies = defaultdict(lambda: 'S')

In [None]:
def policy_improvement(policy, delta = 0.05, gamma=0.99):
    """Return the greedy policy with respect to the value of ``policy``.

    The given policy is first evaluated with ``policy_evaluation``; then, for
    every state in ``env.observation_space``, the action from
    ``env.action_space`` with the highest expected one-step reward plus
    discounted successor value is selected (first one wins on ties).

    Args:
        policy: either a callable ``state -> action`` or a mapping indexed by
            state holding the action. The argument is not reassigned; note
            that looking up a missing state in a ``defaultdict`` inserts it.
        delta: convergence threshold forwarded to ``policy_evaluation``.
        gamma: discount factor, used both for the evaluation and for the
            greedy one-step look-ahead.
            NOTE(review): defaults to 0.99 here while ``policy_evaluation``
            defaults to 0.999 -- confirm which value is intended.

    Returns:
        A new dict mapping every state in ``env.observation_space`` to its
        greedy action.
    """
    # policy_evaluation calls the policy as a function, so wrap mappings.
    if callable(policy):
        policy_fn = policy
    else:
        policy_fn = lambda s: policy[s]

    U = policy_evaluation(policy_fn, delta, gamma)

    # Build a fresh mapping instead of aliasing (and mutating) the input.
    policy_prime = {}

    for s in env.observation_space:
        best_action = None
        best_value = float('-inf')

        for a in env.action_space:
            transitions = env.P[s][a]
            rewards = env.R[s][a]
            q = sum(
                transitions[s_prime] * (rewards[s_prime] + gamma * U[s_prime])
                for s_prime in transitions.keys()
            )
            if q > best_value:
                best_value = q
                best_action = a

        policy_prime[s] = best_action

    return policy_prime
        

### Policy Iteration

In [None]:
# Realizar policy iteration y ver cuál es la mejor acción para cada estado

### Value Iteration

1. Implementar el algoritmo de Value Iteration para encontrar la policy óptima.
2. Obtener la utilidad esperada de esa policy.
3. Graficar el valor del estado "21" para cada iteración en Value Iteration y Policy Iteration (con una policy inicial arbitraria).

In [None]:
def value_iteration(mdp, delta_threshold=0.05, gamma=0.999):
    """Run value iteration on ``mdp`` and extract the greedy policy.

    ``mdp`` must expose the same attributes the notebook uses on ``env``:
    ``observation_space`` and ``action_space`` (iterables), ``P[s][a]``
    (mapping successor state -> probability) and ``R[s][a]`` (mapping
    successor state -> reward).

    Args:
        mdp: the decision process to solve.
        delta_threshold: iteration stops once the largest absolute change of
            any state value between two consecutive sweeps is
            ``<= delta_threshold``.
        gamma: discount factor.

    Returns:
        A tuple ``(V, policy, history)`` where ``V`` maps state -> value,
        ``policy`` maps every state in ``mdp.observation_space`` to its
        greedy action (first one wins on ties), and ``history`` is the list
        of value tables, starting with the all-zero initial table followed by
        one entry per sweep (useful for plotting a state's value over time).
    """
    states = list(mdp.observation_space)
    actions = list(mdp.action_space)

    V = {s: 0 for s in states}
    # Successor states that are not part of observation_space still need an
    # entry so the look-ahead can read their value; they stay at 0.
    for s in states:
        for a in actions:
            for s_prime in mdp.P[s][a].keys():
                V.setdefault(s_prime, 0)

    history = [V.copy()]
    while True:
        # Synchronous update: every q-value reads the previous sweep's table.
        V_new = V.copy()
        for s in states:
            V_new[s] = max(q_value(mdp, s, a, V, gamma) for a in actions)
        max_diff = max((abs(V_new[s] - V[s]) for s in states), default=0)
        V = V_new
        history.append(V.copy())
        if max_diff <= delta_threshold:
            break

    policy = {
        s: max(actions, key=lambda a: q_value(mdp, s, a, V, gamma))
        for s in states
    }
    return V, policy, history

def q_value(mdp, state, action, V, gamma):
    """Expected return of taking ``action`` in ``state`` and then following ``V``.

    Computes ``sum_s' P[state][action][s'] * (R[state][action][s'] + gamma * V[s'])``.

    Args:
        mdp: object exposing ``P`` and ``R`` indexed as ``[state][action][s']``.
        state: the state in which the action is taken.
        action: the action to evaluate.
        V: mapping successor state -> current value estimate.
        gamma: discount factor.

    Returns:
        The q-value as a number (0 when the action has no listed successors).
    """
    transitions = mdp.P[state][action]
    rewards = mdp.R[state][action]
    return sum(
        transitions[s_prime] * (rewards[s_prime] + gamma * V[s_prime])
        for s_prime in transitions.keys()
    )