<a href="https://colab.research.google.com/github/Ilaharshith/Reinforcement-Learning-/blob/main/LAB_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
import numpy as np
# Build the FrozenLake environment. is_slippery=False selects the non-slippery
# variant (per the Gymnasium docs, actions then always move as intended).
env = gym.make("FrozenLake-v1", is_slippery=False)

# Sizes of the discrete state and action spaces, read once from the environment.
n_states = env.observation_space.n
n_actions = env.action_space.n
# Discount factor applied to next-state values in the Bellman backups.
gamma = 0.99
# Convergence threshold: sweeps stop once the largest value change is below it.
theta = 1e-8


def value_iteration(env, discount=None, tolerance=None):
    """Solve a tabular environment with value iteration.

    The state/action counts and the transition model are taken from the
    ``env`` argument itself (``env.observation_space.n``,
    ``env.action_space.n`` and ``env.unwrapped.P``), so the function works
    for whichever environment is passed in rather than only for the one the
    notebook-level size constants were computed from.

    Args:
        env: Environment exposing discrete ``observation_space`` and
            ``action_space`` (each with an ``n`` attribute) and a transition
            table ``env.unwrapped.P`` where ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        discount: Discount factor. When ``None`` the module-level ``gamma``
            is used, which is the behaviour of the original one-argument call.
        tolerance: Stop once the largest per-state value change in a sweep is
            below this threshold. When ``None`` the module-level ``theta``
            is used.

    Returns:
        ``(policy, V)``: an integer array with the greedy action per state and
        a float array with the converged state values.
    """
    # Resolve hyperparameters once so later rebinding of the module-level
    # names cannot change the result part-way through a run.
    discount = gamma if discount is None else discount
    tolerance = theta if tolerance is None else tolerance

    num_states = env.observation_space.n
    num_actions = env.action_space.n
    model = env.unwrapped.P

    V = np.zeros(num_states)

    def action_values(s):
        """Return the one-step lookahead value of every action in state s."""
        # The ``done`` flag is not used; only prob, next_state and reward
        # enter the backup.
        return [
            sum(
                prob * (reward + discount * V[next_state])
                for prob, next_state, reward, _done in model[s][a]
            )
            for a in range(num_actions)
        ]

    while True:
        delta = 0
        for s in range(num_states):
            best = max(action_values(s))
            delta = max(delta, abs(V[s] - best))
            # In-place update: later states in this sweep see the new value.
            V[s] = best
        if delta < tolerance:
            break

    # Extract the greedy policy; np.argmax picks the first maximal action.
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        policy[s] = np.argmax(action_values(s))
    return policy, V


def policy_iteration(env, discount=None, tolerance=None):
    """Solve a tabular environment with policy iteration.

    Alternates iterative policy evaluation with greedy policy improvement
    until an improvement pass leaves every state's action unchanged. The
    state/action counts and transition model come from the ``env`` argument
    itself, so the function works for whichever environment is passed in.

    Args:
        env: Environment exposing discrete ``observation_space`` and
            ``action_space`` (each with an ``n`` attribute) and a transition
            table ``env.unwrapped.P`` where ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        discount: Discount factor. When ``None`` the module-level ``gamma``
            is used, which is the behaviour of the original one-argument call.
        tolerance: Evaluation stops once the largest per-state value change in
            a sweep is below this threshold. When ``None`` the module-level
            ``theta`` is used.

    Returns:
        ``(policy, V)``: an integer array with the chosen action per state and
        a float array with that policy's state values.
    """
    # Resolve hyperparameters once so later rebinding of the module-level
    # names cannot change the result part-way through a run.
    discount = gamma if discount is None else discount
    tolerance = theta if tolerance is None else tolerance

    num_states = env.observation_space.n
    num_actions = env.action_space.n
    model = env.unwrapped.P

    # Start from the all-zeros policy (action 0 everywhere) and zero values.
    policy = np.zeros(num_states, dtype=int)
    V = np.zeros(num_states)

    def backup(s, a):
        """Return the expected one-step return of action a in state s."""
        # The ``done`` flag is not used; only prob, next_state and reward
        # enter the backup.
        return sum(
            prob * (reward + discount * V[next_state])
            for prob, next_state, reward, _done in model[s][a]
        )

    while True:
        # Policy evaluation: sweep in place until values stop changing.
        # V is carried over between outer iterations as a warm start.
        while True:
            delta = 0
            for s in range(num_states):
                v = backup(s, policy[s])
                delta = max(delta, abs(V[s] - v))
                V[s] = v
            if delta < tolerance:
                break

        # Policy improvement: act greedily with respect to the evaluated V.
        policy_stable = True
        for s in range(num_states):
            old_action = policy[s]
            # np.argmax picks the first maximal action on ties.
            new_action = np.argmax([backup(s, a) for a in range(num_actions)])
            policy[s] = new_action
            if old_action != new_action:
                policy_stable = False

        if policy_stable:
            break

    return policy, V


def run_policy(env, policy, render=False):
    """Play one episode following ``policy`` and return the summed reward.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Indexable mapping from state to action.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The sum of the rewards collected until the episode is terminated or
        truncated.
    """
    state, _ = env.reset()
    total_rewards = 0
    finished = False
    while not finished:
        if render:
            env.render()
        state, reward, terminated, truncated, _ = env.step(policy[state])
        total_rewards += reward
        # Either end-of-episode signal stops the rollout.
        finished = terminated or truncated
    return total_rewards


# Solve FrozenLake with value iteration, then roll the resulting policy out
# for 100 episodes and report the mean episode reward.
vi_policy, vi_V = value_iteration(env)
vi_rewards = [run_policy(env, vi_policy) for _ in range(100)]
print("Value Iteration - Average Reward over 100 episodes:", np.mean(vi_rewards))

# Same evaluation for the policy found by policy iteration.
pi_policy, pi_V = policy_iteration(env)
pi_rewards = [run_policy(env, pi_policy) for _ in range(100)]
print("Policy Iteration - Average Reward over 100 episodes:", np.mean(pi_rewards))

Value Iteration - Average Reward over 100 episodes: 1.0
Policy Iteration - Average Reward over 100 episodes: 1.0


In [None]:
import numpy as np

# Hand-written MDP with four states and two actions.
# transitions[state][action] is a list of (probability, next_state, reward)
# tuples; every entry here has probability 1.0, so each action has exactly
# one outcome.
transitions = {
    0: {
        0: [(1.0, 1, 1)],
        1: [(1.0, 2, 0)],
    },
    1: {
        0: [(1.0, 3, 5)],
        1: [(1.0, 0, 0)],
    },
    2: {
        0: [(1.0, 1, 1)],
        1: [(1.0, 3, 2)],
    },
    # State 3 loops back to itself with zero reward under both actions.
    3: {
        0: [(1.0, 3, 0)],
        1: [(1.0, 3, 0)],
    }
}

# State and action identifiers; they double as indices into V and policy.
states = [0, 1, 2, 3]
actions = [0, 1]
# Discount factor and convergence threshold for this MDP.
# NOTE: these rebind the module-level gamma/theta set earlier in the file.
gamma = 0.9
theta = 1e-5

def value_iteration():
    """Run value iteration on the module-level ``transitions`` MDP.

    Reads ``states``, ``actions``, ``transitions``, ``gamma`` and ``theta``
    from module scope. Values are updated in place during each sweep, and
    sweeping stops once the largest change in a sweep is below ``theta``.

    Returns:
        ``(V, policy)``: the converged state values and, per state, the index
        of the action with the highest one-step lookahead value.
    """
    V = np.zeros(len(states))
    policy = np.zeros(len(states), dtype=int)

    def lookahead(s, a):
        # Expected reward plus discounted value of the successor state(s).
        return sum(
            prob * (reward + gamma * V[next_state])
            for prob, next_state, reward in transitions[s][a]
        )

    converged = False
    while not converged:
        delta = 0
        for s in states:
            best_value = max([lookahead(s, a) for a in actions])
            delta = max(delta, abs(best_value - V[s]))
            V[s] = best_value
        converged = delta < theta

    # Greedy policy extraction from the converged values.
    for s in states:
        policy[s] = np.argmax([lookahead(s, a) for a in actions])

    return V, policy

def policy_evaluation(policy, V):
    """Evaluate ``policy`` on the module-level MDP, updating ``V`` in place.

    Args:
        policy: Indexable giving the action taken in each state.
        V: Array of current state-value estimates; it is overwritten during
            the sweeps and also returned.

    Returns:
        The same ``V`` object, after sweeps have continued until the largest
        per-state change in a sweep is below the module-level ``theta``.
    """
    converged = False
    while not converged:
        delta = 0
        for s in states:
            # Back up the single action the policy prescribes for s.
            backed_up = sum(
                prob * (reward + gamma * V[next_state])
                for prob, next_state, reward in transitions[s][policy[s]]
            )
            delta = max(delta, abs(backed_up - V[s]))
            V[s] = backed_up
        converged = delta < theta
    return V

def policy_iteration():
    """Run policy iteration on the module-level ``transitions`` MDP.

    Starts from the policy that picks action index 0 in every state and from
    zero values, then alternates ``policy_evaluation`` with a greedy
    improvement pass until an improvement pass changes no state's action.

    Returns:
        ``(V, policy)``: the state values of the final policy and the action
        index chosen in each state.
    """
    V = np.zeros(len(states))
    policy = np.zeros(len(states), dtype=int)

    def lookahead(s, a):
        # Expected reward plus discounted value of the successor state(s).
        return sum(
            prob * (reward + gamma * V[next_state])
            for prob, next_state, reward in transitions[s][a]
        )

    policy_stable = False
    while not policy_stable:
        V = policy_evaluation(policy, V)

        # Assume stability until some state switches to a different action.
        policy_stable = True
        for s in states:
            old_action = policy[s]
            best_action = np.argmax([lookahead(s, a) for a in actions])
            policy[s] = best_action
            if best_action != old_action:
                policy_stable = False

    return V, policy

# Run both algorithms on the hand-written MDP defined above.
# Both functions return (values, policy), in that order.
V_vi, policy_vi = value_iteration()
V_pi, policy_pi = policy_iteration()

# Print the values and policy from each algorithm.
print("Value Iteration:\nValues:", V_vi, "\nPolicy:", policy_vi)
print("\nPolicy Iteration:\nValues:", V_pi, "\nPolicy:", policy_pi)

Value Iteration:
Values: [5.5 5.  5.5 0. ] 
Policy: [0 0 0 0]

Policy Iteration:
Values: [5.5 5.  5.5 0. ] 
Policy: [0 0 0 0]
