# In [None]:
"""
    Policy iteration learning algorithm.
    For more details, see the blog post:
        https://blog.csdn.net/njshaka/article/details/892379
    
    author: Xinchen Han
    date: 2020/7/25
"""

# In [None]:
import gym
import numpy as np

# Build the FrozenLake environment; its transition table (env.P) is what the
# planning functions in this file read.
env = gym.make('FrozenLake-v0')

# Size of the observation space, used as the number of states to iterate over.
eon = env.observation_space.n

# Size of the action space, used as the number of actions to iterate over.
ean = env.action_space.n



def _default_env():
    """Return the module-level ``env`` used when no environment is passed."""
    return env


def compute_value_function(policy, gamma=1.0, env=None):
    """Evaluate *policy* by iterating its Bellman expectation backup.

    Every sweep recomputes each state's value from the values of the
    previous sweep (a synchronous update).  Sweeping stops once the summed
    absolute change over all states is at most ``1e-10``.  There is no cap
    on the number of sweeps, so the call only returns if the values converge.

    Args:
        policy: Sequence indexed by state that holds the action to take in
            that state.  Entries may be floats; they are cast to ``int``
            before being used as keys into ``env.P``.
        gamma: Discount factor applied to the next state's value.
        env: Environment exposing ``P[state][action]`` as an iterable of
            ``(trans_prob, next_state, reward, done)`` tuples and
            ``observation_space.n``.  Defaults to the module-level ``env``.

    Returns:
        ``np.ndarray`` with one value per state.
    """
    if env is None:
        env = _default_env()
    n_states = env.observation_space.n

    value_table = np.zeros(n_states)
    threshold = 1e-10
    while True:
        # Snapshot of the last sweep; all backups below read from it.
        previous_values = np.copy(value_table)
        for state in range(n_states):
            # The policy array is float-typed; use a real int as the dict key.
            action = int(policy[state])
            value_table[state] = sum(
                trans_prob * (reward + gamma * previous_values[next_state])
                for trans_prob, next_state, reward, _done in env.P[state][action]
            )
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            break
    return value_table


def extract_policy(value_table, gamma=1.0, env=None):
    """Return the policy that is greedy with respect to *value_table*.

    For each state the one-step look-ahead value of every action is computed
    from ``env.P`` and the action with the largest value is selected
    (``np.argmax`` picks the lowest-numbered action on ties).

    Args:
        value_table: Sequence of state values indexed by state.
        gamma: Discount factor applied to the next state's value.
        env: Environment exposing ``P``, ``observation_space.n`` and
            ``action_space.n``.  Defaults to the module-level ``env``.

    Returns:
        Float ``np.ndarray`` with the chosen action index for every state.
    """
    if env is None:
        env = _default_env()
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    policy = np.zeros(n_states)
    for state in range(n_states):
        q_values = np.zeros(n_actions)
        for action in range(n_actions):
            for trans_prob, next_state, reward, _done in env.P[state][action]:
                q_values[action] += trans_prob * (
                    reward + gamma * value_table[next_state]
                )
        policy[state] = np.argmax(q_values)
    return policy


def policy_iteration(env, gamma=1.0):
    """Run policy iteration on *env* and return the resulting policy.

    Starts from the policy that takes action 0 in every state, then
    alternates policy evaluation and greedy improvement until the policy
    stops changing or 200000 iterations have run.  A message is printed when
    the policy stabilises.

    Args:
        env: Environment to solve; it is forwarded to the evaluation and
            improvement steps.  ``None`` selects the module-level ``env``.
        gamma: Discount factor.

    Returns:
        Float ``np.ndarray`` with the action index for every state.
    """
    if env is None:
        env = _default_env()

    policy = np.zeros(env.observation_space.n)
    no_of_iterations = 200000
    for i in range(no_of_iterations):
        value_function = compute_value_function(policy, gamma, env)
        improved_policy = extract_policy(value_function, gamma, env)
        converged = np.all(policy == improved_policy)
        policy = improved_policy
        if converged:
            print('Policy-Iteration converged as step %d.' % (i+1))
            break
    return policy


print(policy_iteration(env))
