In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(suppress=True)

In [5]:
gamma = 0.99

def policy_evaluation(env, pi, gamma=0.99, eps=1e-5):
    n_states = env.observation_space.n
    
    V = np.ones(n_states)
    V[n_states-1] = 0
    delta = 1
    
    while delta > eps:
        delta = 0
        for s in range(n_states):
            v = V[s]
            a = pi[s]
            
            v_sum = 0
            for next_s in range(n_states):
#                 print(s, next_s, P[next_s, s])
                v_sum += P[next_s, s, a]*(0 + gamma*V[next_s])  
            
            V[s] = v_sum
#             print(s, V[s])
            delta = max(delta, abs(v - V[s]))
#         break
            
    return V
    
def policy_improvement(env, pi, V):
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    
    stable = True
    
    for s in range(n_states):
        a_old = pi[s]
        
        action_values = np.zeros(n_actions)
        for a in range(n_actions):
            
            action_value = 0
            for next_s in range(n_states):
                action_value += P[next_s, s, a]*(0 + gamma*V[next_s])
            
            action_values[a] = action_value
            
        pi[s] = np.argmax(action_values)
        if pi[s] != a_old:
            stable = False
            
    return pi, stable
        
        
env = gym.make('FrozenLake-v0', is_slippery=False)

# print(pi)




def get_valid_actions(s1, s2):
    
    valid_actions = np.zeros(n_actions)
    
    #left
    if s1-s2==1 and s1%4 > 0:
        valid_actions[0] = 1
    
    #down
    if s1-s2==-4:
        valid_actions[1] = 1
    
    # right
    if s1-s2==-1 and s2%4 < 3:
        valid_actions[2] = 1
        
    # up
    if s1-s2==4:
        valid_actions[3] = 1
        
#     if s1==s2:
#         # left
#         # down
#         # right
#         if s1%4==3:
#             valid_actions[3] = 1
#         # up
#         if s1<4:
#             valid_actions[3] = 1
        
    return valid_actions

n_actions = env.action_space.n
n_states = env.observation_space.n
P = np.zeros(shape=(n_states, n_states, n_actions))

for s1 in range(n_states):
    for s2 in range(n_states):
        P[s2, s1] = get_valid_actions(s1, s2)

def policy_iteration(env):

    pi = np.random.randint(n_actions, size=n_states)
    pi = [2,2,1,0,0,0,1,0,0,0,1,0,0,0,2,0]
    print(pi)
    stable = False
    
    while not stable: 
        V = policy_evaluation(env, pi)
        print('V')
        pi, stable = policy_improvement(env, pi, V)
        print(pi)
        break
        
policy_iteration(env)
# P[s',s,a] = P(s'|s,a)
# Note that the transition matrix is not needed, since transitions are deterministic.

[2, 2, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0]
f [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [35]:
def pi(state):
    
    return np.random.randint(4)

def evaluate_policy_TD(env, pi, num_episodes, alpha, gamma):
    
    V = np.zeros(env.observation_space.n)
    

    for i in range(num_episodes):
    
        done = False
        state = env.reset()
        
        while not done:

            action = pi(state)
            next_state, reward, done, info = env.step(action)
            V[state] += alpha*(reward + gamma*V[next_state] - V[state])
            state = next_state
            
    return V


def evaluate_policy_MC(env, pi, num_episodes, alpha, gamma):
    
    V = np.zeros(env.observation_space.n)
    states  = []
    rewards = []

    for i in range(num_episodes):
    
        done = False
        rewards.clear()
        state = env.reset()
        states.append(state)

        while not done:
            
            action = pi(state)
            next_state, reward, done, info = env.step(action)
            
            states.append(next_state)
            rewards.append(reward)
            
            state = next_state
            
        reward_sum = 0
        
        for state in states:
            for i in range(len(rewards)):
                reward_sum += gamma**i * rewards[i]

            V[state] = reward_sum
            

    return V
        
    
    


In [36]:
env = gym.make('FrozenLake-v0')
evaluate_policy_MC(env, pi, 1000, 1, 1)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [111]:
# Naive implementation (for loops are slow), but matches the box
def policy_iter(env, gamma, theta):
    """Policy Iteration Algorithm
    
    Params:
        env - environment with following required memebers:
            env.nb_states - number of states
            env.nb_action - number of actions
            env.model     - prob-transitions and rewards for all states and actions, see note #1
        gamma (float) - discount factor
        theta (float) - termination condition
    """
    
    # 1. Initialization
    V = np.ones(env.nb_states)
    pi = np.zeros(env.nb_states, dtype=int)  # greedy, always pick action 0
    
    while True:
    
        # 2. Policy Evaluation
        while True:
            delta = 0
            for s in range(env.nb_states):
                v = V[s]
                V[s] = sum_sr(env, V=V, s=s, a=pi[s], gamma=gamma)
                delta = max(delta, abs(v - V[s]))
            if delta < theta: break

        # 3. Policy Improvement
        policy_stable = True
        for s in range(env.nb_states):
            old_action = pi[s]
            pi[s] = np.argmax([sum_sr(env, V=V, s=s, a=a, gamma=gamma)  # list comprehension
                               for a in range(env.nb_actions)])
            if old_action != pi[s]: policy_stable = False
        if policy_stable: break
    
    return V, pi

def sum_sr(env, V, s, a, gamma):
    """Calc state-action value for state 's' and action 'a'"""
    tmp = 0  # state value for state s
    for p, s_, r, _ in env.model[s][a]:     # see note #1 !
        # p  - transition probability from (s,a) to (s')
        # s_ - next state (s')
        # r  - reward on transition from (s,a) to (s')
        tmp += p * (r + gamma * V[s_])
    return tmp

In [112]:
env = gym.make('FrozenLake-v0')
env.reset()
env.render()

if not hasattr(env, 'nb_states'):  env.nb_states = env.env.nS
if not hasattr(env, 'nb_actions'): env.nb_actions = env.env.nA
if not hasattr(env, 'model'):      env.model = env.env.P

V, pi = policy_iter(env, gamma=1.0, theta=1e-8)
print(V.reshape([4, -1]))


[41mS[0mFFF
FHFH
FFFH
HFFG
[[0.82352925 0.82352919 0.82352915 0.82352913]
 [0.82352926 0.         0.52941165 0.        ]
 [0.82352929 0.82352932 0.7647058  0.        ]
 [0.         0.88235288 0.94117644 0.        ]]
