In [7]:
import numpy as np
import pandas as pd
from gridworld import GridworldEnv

In [6]:
# Build the gridworld environment with its default constructor arguments.
# NOTE(review): presumably this exposes nS, nA and the transition table P
# that model-based evaluation needs -- confirm against gridworld.py.
env = GridworldEnv()

In [None]:
"""
    Model-based setting, i.e., the state-transition probabilities and the reward function are given.
    In this case (dynamic programming), policy evaluation amounts to computing the state-value function
    over the tabular representation of the states.
    So first, we should build the state table.
"""

In [None]:
# Iterative policy evaluation: repeatedly sweep over the states until V is close to the state-value function of the given policy
def policy_eval(policy, env, discount_factor=1.0, threhold=0.00001):
    """
    Evaluate a policy with iterative (in-place) policy evaluation.

    Repeatedly applies the Bellman expectation backup to every state, using
    the full environment model, until the largest change in any state's
    value during one sweep drops below ``threhold``.

    Args:
        policy:
            Table indexed as policy[s][a] giving the probability of taking
            action ``a`` in state ``s`` (one row per state, one entry per
            action).

        env: env.P represents the transition probabilities of the environment.
             env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
             env.nS is the number of states in the environment.
             env.nA is the number of actions in the environment.

        discount_factor: Discount applied to the value of the next state.

        threhold: We stop evaluation once our value function change is less
            than threhold for all states. (The misspelled name is kept so
            that existing keyword callers keep working.)

    Returns:
        Vector of length env.nS representing the value function of ``policy``.

    Note:
        The loop only terminates if the values converge. With
        ``discount_factor=1.0`` that presumably requires the policy to reach
        a zero-reward absorbing state -- verify for the environment in use.
    """

    # Start from an all-zero estimate of V(s).
    V = np.zeros(env.nS)

    while True:
        # Largest absolute change of any state's value in this sweep.
        delta = 0

        for s in range(env.nS):
            # The backup is an expectation, so it must start from zero
            # rather than from the previous estimate of V[s].
            v = 0
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at the possible next states...
                for prob, next_state, reward, _done in env.P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            # In-place update: later states in this sweep already see it.
            V[s] = v

        if delta < threhold:
            break
    return np.array(V)