# Santhosh Prabhu
# 220968025

In [9]:
import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import FrozenLakeEnv


In [10]:
def policy_iteration(strategy, environment, gamma=0.99, threshold=1e-8, max_iters=1000):

    if hasattr(environment, 'unwrapped'):
        environment = environment.unwrapped

    num_states = environment.observation_space.n
    num_actions = environment.action_space.n
    value_func = np.zeros(num_states)
    
    for _ in range(max_iters):
        while True:
            delta = 0
            for state in range(num_states):
                old_value = value_func[state]
                new_value = 0
                for action, action_prob in enumerate(strategy[state]):
                    for prob, next_state, reward, done in environment.P[state][action]:
                        new_value += action_prob * prob * (reward + gamma * value_func[next_state])
                value_func[state] = new_value
                delta = max(delta, abs(old_value - new_value))
            if delta < threshold:
                break

        stable = True
        for state in range(num_states):
            old_action = np.argmax(strategy[state])
            action_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, done in environment.P[state][action]:
                    action_values[action] += prob * (reward + gamma * value_func[next_state])
            best_action = np.argmax(action_values)
            new_strategy = np.eye(num_actions)[best_action]
            if not np.array_equal(new_strategy, strategy[state]):
                stable = False
            strategy[state] = new_strategy
        
        if stable:
            break

    return value_func, strategy


In [11]:
def value_iteration(environment, gamma=0.99, threshold=1e-8, max_iters=1000):

    if hasattr(environment, 'unwrapped'):
        environment = environment.unwrapped

    num_states = environment.observation_space.n
    num_actions = environment.action_space.n
    value_func = np.zeros(num_states)

    for _ in range(max_iters):
        delta = 0
        for state in range(num_states):
            old_value = value_func[state]
            action_vals = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, done in environment.P[state][action]:
                    action_vals[action] += prob * (reward + gamma * value_func[next_state])
            value_func[state] = np.max(action_vals)
            delta = max(delta, abs(old_value - value_func[state]))
        if delta < threshold:
            break

    strategy = np.zeros((num_states, num_actions))
    for state in range(num_states):
        action_vals = np.zeros(num_actions)
        for action in range(num_actions):
            for prob, next_state, reward, done in environment.P[state][action]:
                action_vals[action] += prob * (reward + gamma * value_func[next_state])
        best_action = np.argmax(action_vals)
        strategy[state] = np.eye(num_actions)[best_action]

    return value_func, strategy


In [12]:
def assess_strategy(environment, strategy, episodes=1000):

    successes = 0
    total_rewards = 0
    for _ in range(episodes):
        state, _ = environment.reset()
        done = False
        episode_reward = 0
        while not done:
            action = np.random.choice(np.arange(environment.action_space.n), p=strategy[state])
            state, reward, done, _, _ = environment.step(action)
            episode_reward += reward
        if reward > 0:
            successes += 1
        total_rewards += episode_reward
    return successes, total_rewards / episodes


In [13]:
if __name__ == "__main__":
    # Initialize environment
    env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True)
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Initial strategy: equal probability for all actions
    init_strategy = np.ones((num_states, num_actions)) / num_actions
    gamma = 0.99
    threshold = 1e-8
    max_iters = 1000

    # Apply Policy Iteration
    values_pi, strategy_pi = policy_iteration(init_strategy.copy(), env, gamma, threshold, max_iters)
    wins_pi, avg_return_pi = assess_strategy(env, strategy_pi, episodes=1000)
    print("Policy Iteration Results:")
    print(f"Wins: {wins_pi} / 1000 episodes")
    print(f"Average Return: {avg_return_pi:.3f}")

    # Apply Value Iteration
    values_vi, strategy_vi = value_iteration(env, gamma, threshold, max_iters)
    wins_vi, avg_return_vi = assess_strategy(env, strategy_vi, episodes=1000)
    print("\nValue Iteration Results:")
    print(f"Wins: {wins_vi} / 1000 episodes")
    print(f"Average Return: {avg_return_vi:.3f}")


Policy Iteration Results:
Wins: 821 / 1000 episodes
Average Return: 0.821

Value Iteration Results:
Wins: 808 / 1000 episodes
Average Return: 0.808


While both methods performed well, Policy Iteration outperformed Value Iteration in this experiment, making it the preferred choice for solving this FrozenLake-v1 problem. However, in environments with a large state space, Value Iteration may be preferred due to its computational efficiency.