In [None]:
"""
    Value_Iteration learning algorithm.
       
    For each state s and each action a, compute the expected value of the next state s_. The largest of these
expected values over all actions is taken as the new value V(s). This sweep over all states is repeated
in a loop until the value function converges.
    
    Author: Xinchen Han
    date: 2020/7/24
"""
"""
    Description of the problem--Frozen Lake

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located

    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.
    https://reinforcement-learning4.fun/2019/06/16/gym-tutorial-frozen-lake/
"""

In [None]:
import argparse
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np



In [None]:
# --- Experiment configuration ---------------------------------------------
alg_name = 'Value_Iteration'  # algorithm label (not read by the cells shown here)
env_id = 'FrozenLake-v0'      # Gym environment identifier
render = True                 # display the game environment
Max_Episodes = 1000           # upper bound on the number of value-iteration sweeps
delta = 1e-20                 # stop once the summed absolute change in V is <= delta

# --- Environment and value table -------------------------------------------
env = gym.make(env_id)
# One value estimate per state, all starting at zero.
V_table = np.zeros(env.observation_space.n)

# Wall-clock reference taken once setup is complete.
t0 = time.time()


In [None]:
# Value_Iteration
def value_iteration(env, gamma=1.0, max_iterations=None, tol=None,
                    value_table=None, verbose=True):
    """Run synchronous value iteration on a tabular MDP.

    Each sweep computes, for every state, the expected return of every
    action using the value estimates from the *previous* sweep, and stores
    the best action's return as the state's new value.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and
            ``P[state][action]``, an iterable of
            ``(trans_prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to the next state's value.
        max_iterations: Maximum number of sweeps. Defaults to the
            notebook-level ``Max_Episodes``.
        tol: Convergence threshold on the summed absolute change of the
            value table between sweeps. Defaults to the notebook-level
            ``delta``.
        value_table: Array of per-state values to update in place. Defaults
            to the notebook-level ``V_table``, so re-running without
            resetting that array warm-starts from its current contents.
        verbose: When true, print the value table after every sweep and the
            convergence message.

    Returns:
        The same ``value_table`` array, updated in place.
    """
    # Fall back to the notebook-level settings so existing calls keep working.
    if max_iterations is None:
        max_iterations = Max_Episodes
    if tol is None:
        tol = delta
    if value_table is None:
        value_table = V_table

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    for i in range(max_iterations):
        # Snapshot of the previous sweep; every backup in this sweep reads from it.
        previous_values = np.copy(value_table)
        for state in range(n_states):
            action_values = []
            for action in range(n_actions):
                # Expected return of `action`: sum over ALL possible transitions.
                # The `done` flag is not used; the next state's value is always
                # bootstrapped.
                expected_return = np.sum([
                    trans_prob * (reward + gamma * previous_values[next_state])
                    for trans_prob, next_state, reward, _done in env.P[state][action]
                ])
                action_values.append(expected_return)
            # Update only after every action has been fully evaluated.
            value_table[state] = max(action_values)

        if verbose:
            print("Value_table:", value_table)

        if np.sum(np.fabs(previous_values - value_table)) <= tol:
            if verbose:
                print("Value-itration converged at itration # %d" % (i+1))
            break
    return value_table

In [None]:
def extract_policy(value_table, gamma=1.0, environment=None):
    """Derive the greedy policy implied by a state-value table.

    For every state, the expected return of each action is accumulated over
    all of its possible transitions, and the action with the highest
    expected return is selected (ties resolve to the lowest action index,
    as ``np.argmax`` does).

    Args:
        value_table: Indexable collection of per-state values.
        gamma: Discount factor applied to the next state's value.
        environment: Object exposing ``observation_space.n``,
            ``action_space.n`` and ``P[state][action]``, an iterable of
            ``(trans_prob, next_state, reward, done)`` tuples. Defaults to
            the notebook-level ``env``.

    Returns:
        A float NumPy array holding the chosen action index for each state.
    """
    mdp = env if environment is None else environment
    n_states = mdp.observation_space.n
    n_actions = mdp.action_space.n

    policy = np.zeros(n_states)
    for state in range(n_states):
        Q_table = np.zeros(n_actions)
        for action in range(n_actions):
            for trans_prob, next_state, reward, _done in mdp.P[state][action]:
                # Accumulate: the action value is the expectation over every
                # transition, not just the last one listed.
                Q_table[action] += trans_prob * (reward + gamma * value_table[next_state])
        policy[state] = np.argmax(Q_table)
    return policy



# Solve for the state values, then read off the greedy policy, using the
# same discount factor for both steps.
discount = 0.95
optimal_value_function = value_iteration(env, gamma=discount)
optimal_policy = extract_policy(optimal_value_function, gamma=discount)
print(optimal_policy)
