<a href="https://colab.research.google.com/github/LeelaNandhaKishore1511/RL-Lab-Sem-5/blob/main/RL_EXPT_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import numpy as np
import gymnasium as gym

# -------------------------------
# Environment setup
# -------------------------------
# Deterministic FrozenLake (no slipping). The unwrapped environment is kept
# because the planning routines below read its transition model directly.
env = gym.make("FrozenLake-v1", is_slippery=False)
unwrapped_env = env.unwrapped

state_space = unwrapped_env.observation_space
action_space = unwrapped_env.action_space
print("Number of states:", state_space.n)
print("Number of actions:", action_space.n)

# Planning hyper-parameters: discount factor and convergence threshold.
gamma, theta = 0.9, 1e-6

# -------------------------------
# Value Iteration
# -------------------------------
def value_iteration(env, gamma=0.9, theta=1e-6):
    """Compute a greedy-optimal policy and its state values by value iteration.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition model ``P`` where ``P[state][action]`` is an
            iterable of ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold; sweeping stops once the largest value
            change within a sweep drops below it.

    Returns:
        A ``(policy, value_table)`` tuple: an integer array holding the
        greedy action for every state, and a float array of state values.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    def one_step_lookahead(state, values):
        """Return the expected return of each action taken from ``state``."""
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            for prob, next_state, reward, done in env.P[state][action]:
                # A transition flagged ``done`` ends the episode, so nothing
                # is bootstrapped from the state it lands in.
                future = 0.0 if done else values[next_state]
                action_values[action] += prob * (reward + gamma * future)
        return action_values

    value_table = np.zeros(n_states)
    while True:
        delta = 0
        for state in range(n_states):
            best_action_value = np.max(one_step_lookahead(state, value_table))
            delta = max(delta, abs(value_table[state] - best_action_value))
            # Updated in place, so later states in the same sweep already see
            # the new value.
            value_table[state] = best_action_value
        if delta < theta:
            break

    # Derive the greedy policy from the converged values; ties resolve to the
    # lowest action index (np.argmax).
    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        policy[state] = np.argmax(one_step_lookahead(state, value_table))

    return policy, value_table

# -------------------------------
# Policy Evaluation
# -------------------------------
def policy_evaluation(policy, env, gamma=0.9, theta=1e-6):
    """Iteratively evaluate a deterministic policy.

    Args:
        policy: Indexable by state, giving the action taken in that state.
        env: Environment exposing ``observation_space.n`` and a transition
            model ``P`` where ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold; sweeping stops once the largest value
            change within a sweep drops below it.

    Returns:
        A float array with the estimated value of every state under
        ``policy``.
    """
    n_states = env.observation_space.n
    value_table = np.zeros(n_states)
    while True:
        delta = 0
        for state in range(n_states):
            v = 0
            action = policy[state]
            for prob, next_state, reward, done in env.P[state][action]:
                # A transition flagged ``done`` ends the episode, so nothing
                # is bootstrapped from the state it lands in.
                future = 0.0 if done else value_table[next_state]
                v += prob * (reward + gamma * future)
            delta = max(delta, abs(value_table[state] - v))
            # Updated in place, so later states in the same sweep already see
            # the new value.
            value_table[state] = v
        if delta < theta:
            break
    return value_table

# -------------------------------
# Policy Improvement
# -------------------------------
def policy_improvement(value_table, policy, env, gamma=0.9):
    """Make ``policy`` greedy with respect to ``value_table``.

    Args:
        value_table: Indexable by state, giving the current value estimate.
        policy: Mutable array of actions indexed by state. It is updated
            in place and also returned.
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition model ``P`` where ``P[state][action]`` is an
            iterable of ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        A ``(policy, policy_stable)`` tuple; ``policy_stable`` is ``True``
        when no state's action changed during this pass.
    """
    n_actions = env.action_space.n
    policy_stable = True
    for state in range(env.observation_space.n):
        old_action = policy[state]
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            for prob, next_state, reward, done in env.P[state][action]:
                # A transition flagged ``done`` ends the episode, so nothing
                # is bootstrapped from the state it lands in.
                future = 0.0 if done else value_table[next_state]
                action_values[action] += prob * (reward + gamma * future)
        # Ties resolve to the lowest action index (np.argmax).
        policy[state] = np.argmax(action_values)
        if old_action != policy[state]:
            policy_stable = False
    return policy, policy_stable

# -------------------------------
# Policy Iteration
# -------------------------------
def policy_iteration(env, gamma=0.9, theta=1e-6):
    """Alternate policy evaluation and greedy improvement until stable.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n``; it is passed through unchanged to
            ``policy_evaluation`` and ``policy_improvement``.
        gamma: Discount factor forwarded to both steps.
        theta: Convergence threshold forwarded to ``policy_evaluation``.

    Returns:
        A ``(policy, value_table)`` tuple from the first round in which the
        improvement step reports the policy as stable.
    """
    # Start from a random action per state; np.random is not seeded here, so
    # the starting policy differs between runs.
    policy = np.random.choice(env.action_space.n, size=env.observation_space.n)

    while True:
        value_table = policy_evaluation(policy, env, gamma, theta)
        policy, policy_stable = policy_improvement(value_table, policy, env, gamma)
        if policy_stable:
            return policy, value_table

# -------------------------------
# Run Both Algorithms
# -------------------------------
# Lay the results out on the environment's own map grid instead of assuming
# 4x4, so other FrozenLake map sizes print correctly as well.
grid_shape = unwrapped_env.desc.shape

print("\n Running Policy Iteration...")
pi_policy, pi_value = policy_iteration(unwrapped_env, gamma, theta)

print("\nOptimal Value Function (Policy Iteration):")
print(pi_value.reshape(grid_shape))
print("\nOptimal Policy (Policy Iteration):")
print(pi_policy.reshape(grid_shape))

print("\n Running Value Iteration...")
vi_policy, vi_value = value_iteration(unwrapped_env, gamma, theta)

print("\nOptimal Value Function (Value Iteration):")
print(vi_value.reshape(grid_shape))
print("\nOptimal Policy (Value Iteration):")
print(vi_policy.reshape(grid_shape))

# Check if both results match
print("\n Do both methods give same optimal policy? ->", np.array_equal(pi_policy, vi_policy))

Number of states: 16
Number of actions: 4

 Running Policy Iteration...

Optimal Value Function (Policy Iteration):
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.      0.81    0.     ]
 [0.729   0.81    0.9     0.     ]
 [0.      0.9     1.      0.     ]]

Optimal Policy (Policy Iteration):
[[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]

 Running Value Iteration...

Optimal Value Function (Value Iteration):
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.      0.81    0.     ]
 [0.729   0.81    0.9     0.     ]
 [0.      0.9     1.      0.     ]]

Optimal Policy (Value Iteration):
[[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]

 Do both methods give same optimal policy? -> True
