<a href="https://colab.research.google.com/github/Mariihmp/RL_Notebooks/blob/main/policy_eval_Q_V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import numpy as np
project_folder_path = '/content/drive/MyDrive/session-2'
sys.path.append(project_folder_path)
from GridWorld_env import GridWorld

In [None]:
from GridWorld_env import GridWorld

In [None]:
# Grid-world environment instance shared by the cells below.
env = GridWorld()
def get_random_policy(num_states, num_actions):
    """Return a uniform stochastic policy table.

    The result is a ``(num_states, num_actions)`` float array in which every
    entry equals ``1 / num_actions``, i.e. in each state all actions are
    equally likely.
    """
    uniform = np.full((num_states, num_actions), 1.0)
    return uniform / num_actions

# Uniform policy over the environment's states and actions, printed as a
# table of P(a|s) with one row per state.
random_policy = get_random_policy(
    num_states=env.num_states,
    num_actions=env.num_actions,
)
print("Random Policy (P(a|s)):", random_policy, sep="\n")

Random Policy (P(a|s)):
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


In [None]:
def policy_evaluation_v(policy, env, theta=1e-6, max_iterations=None):
    """Iteratively evaluate ``policy`` and return its state-value array V.

    V starts at zero and is updated in place, one state at a time, so states
    visited later in a sweep already see the values refreshed earlier in the
    same sweep. For each non-goal state the new value is the policy-weighted
    sum over actions of ``reward + env.gamma * V[next_state]``, using the
    single transition returned by ``env.step(state, action)``.

    Args:
        policy: 2-D array indexed as ``policy[state_idx, action_idx]`` giving
            the probability of each action in each state.
        env: Environment exposing ``num_states``, ``num_actions``, ``gamma``,
            ``idx_to_state``, ``goal_state`` and ``step(s_idx, a_idx)``, the
            latter returning a ``(next_s_idx, reward, done)`` triple.
        theta: Convergence threshold; sweeping stops once the largest
            absolute value change within a sweep is below it.
        max_iterations: Optional upper bound on the number of sweeps. ``None``
            (the default) keeps sweeping until ``theta`` is met, which may
            never happen if the values do not converge (for example an
            undiscounted problem under a policy that never reaches the goal).

    Returns:
        1-D array of length ``env.num_states``; the goal state's entry is
        never written and therefore stays 0.
    """
    V = np.zeros(env.num_states)
    sweeps = 0
    while max_iterations is None or sweeps < max_iterations:
        delta = 0
        for s_idx in range(env.num_states):
            if env.idx_to_state[s_idx] == env.goal_state:  # Value of goal state is 0
                continue
            v_old = V[s_idx]
            new_v = 0
            for a_idx in range(env.num_actions):
                prob_action = policy[s_idx, a_idx]
                # The `done` flag is not needed: the goal state's value is
                # pinned at 0, so bootstrapping from it adds nothing.
                next_s_idx, reward, _done = env.step(s_idx, a_idx)
                new_v += prob_action * (reward + env.gamma * V[next_s_idx])
            V[s_idx] = new_v
            delta = max(delta, abs(v_old - V[s_idx]))
        sweeps += 1
        if delta < theta:
            break
    return V



In [None]:
def calculate_q_from_v(V, env):
    """Derive the action-value table Q from a state-value array ``V``.

    For every non-goal state and every action, the entry is the one-step
    lookahead ``reward + env.gamma * V[next_state]`` based on the transition
    returned by ``env.step(state, action)``. The goal state's row is left at
    zero.

    Returns:
        Array of shape ``(env.num_states, env.num_actions)``.
    """
    n_states, n_actions = env.num_states, env.num_actions
    q_table = np.zeros((n_states, n_actions))
    for state in range(n_states):
        if env.idx_to_state[state] != env.goal_state:
            for action in range(n_actions):
                successor, reward, _ = env.step(state, action)
                q_table[state, action] = reward + env.gamma * V[successor]
    return q_table

# Evaluate the uniform random policy, then derive its action values from V.
print("\n--- Evaluating Random Policy ---")
V_random = policy_evaluation_v(random_policy, env)
Q_random = calculate_q_from_v(V_random, env)

# State values, one line per state.
print("V(s) for Random Policy:")
for s_idx in range(len(V_random)):
    print(f"  State {env.idx_to_state[s_idx]}: {V_random[s_idx]:.2f}")

# Action values; the goal state gets a single placeholder line instead.
print("\nQ(s,a) for Random Policy:")
for s_idx, q_row in enumerate(Q_random):
    state = env.idx_to_state[s_idx]
    if state == env.goal_state:
        print(f"  State {state} (Goal): Q values are N/A (or 0)")
    else:
        print(f"  State {state}:")
        for a_idx, q_val in enumerate(q_row):
            print(f"    Action {env.actions[a_idx]}: {q_val:.2f}")


--- Evaluating Random Policy ---
V(s) for Random Policy:
  State (0, 0): 1.18
  State (0, 1): 3.66
  State (1, 0): 3.66
  State (1, 1): 0.00

Q(s,a) for Random Policy:
  State (0, 0):
    Action U: 0.06
    Action D: 2.30
    Action L: 0.06
    Action R: 2.30
  State (0, 1):
    Action U: 2.30
    Action D: 10.00
    Action L: 0.06
    Action R: 2.30
  State (1, 0):
    Action U: 0.06
    Action D: 2.30
    Action L: 2.30
    Action R: 10.00
  State (1, 1) (Goal): Q values are N/A (or 0)


In [None]:
# Greedy policy improvement: one-step lookahead on V for every state.
def policy_improvement(V, env):
    """Return the deterministic policy that is greedy with respect to ``V``.

    For each non-goal state, the action maximising
    ``reward + env.gamma * V[next_state]`` (first one wins on ties) receives
    probability 1. The goal state's row is filled uniformly; its choice of
    action does not feed into the values of other states.

    Returns:
        Array of shape ``(env.num_states, env.num_actions)``.
    """
    n_actions = env.num_actions
    greedy = np.zeros((env.num_states, n_actions))
    for state in range(env.num_states):
        row = greedy[state]  # view into the table, so writes land in `greedy`
        if env.idx_to_state[state] == env.goal_state:
            row[:] = 1.0 / n_actions
        else:
            lookahead = []
            for action in range(n_actions):
                successor, reward, _ = env.step(state, action)
                lookahead.append(reward + env.gamma * V[successor])
            row[np.argmax(lookahead)] = 1.0
    return greedy

In [None]:
def policy_iteration(env, initial_policy=None, theta=1e-6):
    """Alternate policy evaluation and greedy improvement until stable.

    Starts from a copy of ``initial_policy`` (or a uniform random policy when
    none is given). Each round evaluates the current policy with
    ``policy_evaluation_v``, builds the greedy policy with
    ``policy_improvement``, prints both, and stops once the greedy policy
    equals the current one.

    Args:
        env: Environment passed through to the evaluation/improvement helpers.
        initial_policy: Optional starting policy table; it is copied, not
            modified.
        theta: Convergence threshold forwarded to ``policy_evaluation_v``.

    Returns:
        Tuple ``(policy, V, Q)``: the final policy, the state values from the
        last evaluation, and the action values derived from those via
        ``calculate_q_from_v``.
    """
    policy = (
        get_random_policy(env.num_states, env.num_actions)
        if initial_policy is None
        else initial_policy.copy()
    )

    round_no = 0
    stable = False
    while not stable:
        round_no += 1
        print(f"\n--- Policy Iteration: Iteration {round_no} ---")

        # Step 1: evaluate the current policy.
        V = policy_evaluation_v(policy, env, theta)
        print("  V(s) evaluated:")
        for idx, value in enumerate(V):
            print(f"    State {env.idx_to_state[idx]}: {value:.2f}")

        # Step 2: act greedily with respect to the fresh values.
        candidate = policy_improvement(V, env)
        print("  New Policy (Greedy):")
        for idx in range(env.num_states):
            chosen = np.where(candidate[idx] == 1)[0]
            print(f"    State {env.idx_to_state[idx]}: Best Action(s) index(es) = {chosen}")

        stable = np.array_equal(candidate, policy)
        if stable:
            print("\nPolicy converged!")
        else:
            policy = candidate

    # Action values for the final policy, derived from its state values.
    return policy, V, calculate_q_from_v(V, env)

In [None]:
# Run policy iteration from a copy of the uniform random policy and report
# the resulting policy, state values and action values.
# Banners use the same "--- ... ---" format as the other section headers.
print("\n\n--- Starting Policy Iteration ---")
optimal_policy_pi, V_optimal_pi, Q_optimal_pi = policy_iteration(env, initial_policy=random_policy.copy())

print("\n--- Results from Policy Iteration ---")
print("Optimal Policy (PI):")
for s_idx in range(env.num_states):
    # Only actions with probability exactly 1.0 are listed, so a row that is
    # spread uniformly over several actions (the goal state) prints as [].
    best_actions = [env.actions[i] for i, p in enumerate(optimal_policy_pi[s_idx]) if p == 1.0]
    print(f"  State {env.idx_to_state[s_idx]}: Optimal Action(s) = {best_actions}")

print("\nOptimal V(s) (PI):")
for s_idx, v_val in enumerate(V_optimal_pi):
    print(f"  State {env.idx_to_state[s_idx]}: {v_val:.2f}")

print("\nOptimal Q(s,a) (PI):")
for s_idx in range(env.num_states):
    # The goal state's Q row is skipped entirely.
    if env.idx_to_state[s_idx] == env.goal_state: continue
    print(f"  State {env.idx_to_state[s_idx]}:")
    for a_idx, q_val in enumerate(Q_optimal_pi[s_idx]):
        print(f"    Action {env.actions[a_idx]}: {q_val:.2f}")



--- Starting Policy Iteration ---

--- Policy Iteration: Iteration 1 ---
  V(s) evaluated:
    State (0, 0): 1.18
    State (0, 1): 3.66
    State (1, 0): 3.66
    State (1, 1): 0.00
  New Policy (Greedy):
    State (0, 0): Best Action(s) index(es) = [1]
    State (0, 1): Best Action(s) index(es) = [1]
    State (1, 0): Best Action(s) index(es) = [3]
    State (1, 1): Best Action(s) index(es) = []

--- Policy Iteration: Iteration 2 ---
  V(s) evaluated:
    State (0, 0): 8.00
    State (0, 1): 10.00
    State (1, 0): 10.00
    State (1, 1): 0.00
  New Policy (Greedy):
    State (0, 0): Best Action(s) index(es) = [1]
    State (0, 1): Best Action(s) index(es) = [1]
    State (1, 0): Best Action(s) index(es) = [3]
    State (1, 1): Best Action(s) index(es) = []

Policy converged!

--- Results from Policy Iteration ---
Optimal Policy (PI):
  State (0, 0): Optimal Action(s) = ['D']
  State (0, 1): Optimal Action(s) = ['D']
  State (1, 0): Optimal Action(s) = ['R']
  State (1, 1): Optimal Action(s) = []

In [None]:
def value_iteration(env, theta=1e-6):
    """Run value iteration and return the greedy policy, V and Q.

    V starts at zero and is updated in place: each non-goal state's value
    becomes the maximum over actions of ``reward + env.gamma * V[next_state]``,
    using the transition returned by ``env.step(state, action)``. Progress is
    printed after every sweep, and sweeping stops once the largest absolute
    change within a sweep is below ``theta``.

    Args:
        env: Environment exposing ``num_states``, ``num_actions``, ``gamma``,
            ``idx_to_state``, ``goal_state`` and ``step(s_idx, a_idx)``, the
            latter returning a ``(next_s_idx, reward, done)`` triple.
        theta: Convergence threshold on the per-sweep maximum value change.

    Returns:
        Tuple ``(optimal_policy, V, Q_optimal)``: the policy that is greedy
        with respect to the converged values (built by ``policy_improvement``),
        the state values, and the action values from ``calculate_q_from_v``.
    """
    V = np.zeros(env.num_states)
    iteration = 0
    while True:
        iteration += 1
        delta = 0
        print(f"\n--- Value Iteration: Iteration {iteration} ---")
        for s_idx in range(env.num_states):
            # The goal state is never updated, so its value stays 0.
            if env.idx_to_state[s_idx] == env.goal_state:
                continue
            v_old = V[s_idx]
            action_values = np.zeros(env.num_actions)
            for a_idx in range(env.num_actions):
                # `done` is not needed because the goal's value is pinned at 0.
                next_s_idx, reward, _done = env.step(s_idx, a_idx)
                action_values[a_idx] = reward + env.gamma * V[next_s_idx]
            V[s_idx] = np.max(action_values)
            delta = max(delta, abs(v_old - V[s_idx]))

        print("  V(s) updated:")
        for s_idx_print, v_val_print in enumerate(V):
            print(f"    State {env.idx_to_state[s_idx_print]}: {v_val_print:.2f}")

        if delta < theta:
            print("\nValue function converged!")
            break

    # Extract the greedy policy with the shared helper instead of repeating
    # its one-step-lookahead/argmax loop here.
    optimal_policy = policy_improvement(V, env)

    Q_optimal = calculate_q_from_v(V, env)
    return optimal_policy, V, Q_optimal


In [None]:



# Run value iteration and report the resulting policy, state values and
# action values.
print("\n\n--- Starting Value Iteration ---")
optimal_policy_vi, V_optimal_vi, Q_optimal_vi = value_iteration(env)

print("\n--- Results from Value Iteration ---")
print("Optimal Policy (VI):")
for s_idx, action_probs in enumerate(optimal_policy_vi):
    # Keep only the actions chosen with probability exactly 1.0.
    best_actions = [
        env.actions[a_idx]
        for a_idx, prob in enumerate(action_probs)
        if prob == 1.0
    ]
    print(f"  State {env.idx_to_state[s_idx]}: Optimal Action(s) = {best_actions}")

print("\nOptimal V(s) (VI):")
for s_idx in range(len(V_optimal_vi)):
    print(f"  State {env.idx_to_state[s_idx]}: {V_optimal_vi[s_idx]:.2f}")

print("\nOptimal Q(s,a) (VI):")
for s_idx, q_row in enumerate(Q_optimal_vi):
    # The goal state's Q row is not printed.
    if env.idx_to_state[s_idx] != env.goal_state:
        print(f"  State {env.idx_to_state[s_idx]}:")
        for a_idx, q_val in enumerate(q_row):
            print(f"    Action {env.actions[a_idx]}: {q_val:.2f}")



--- Starting Value Iteration ---

--- Value Iteration: Iteration 1 ---
  V(s) updated:
    State (0, 0): -1.00
    State (0, 1): 10.00
    State (1, 0): 10.00
    State (1, 1): 0.00

--- Value Iteration: Iteration 2 ---
  V(s) updated:
    State (0, 0): 8.00
    State (0, 1): 10.00
    State (1, 0): 10.00
    State (1, 1): 0.00

--- Value Iteration: Iteration 3 ---
  V(s) updated:
    State (0, 0): 8.00
    State (0, 1): 10.00
    State (1, 0): 10.00
    State (1, 1): 0.00

Value function converged!

--- Results from Value Iteration ---
Optimal Policy (VI):
  State (0, 0): Optimal Action(s) = ['D']
  State (0, 1): Optimal Action(s) = ['D']
  State (1, 0): Optimal Action(s) = ['R']
  State (1, 1): Optimal Action(s) = []

Optimal V(s) (VI):
  State (0, 0): 8.00
  State (0, 1): 10.00
  State (1, 0): 10.00
  State (1, 1): 0.00

Optimal Q(s,a) (VI):
  State (0, 0):
    Action U: 6.20
    Action D: 8.00
    Action L: 6.20
    Action R: 8.00
  State (0, 1):
    Action U: 8.00
    Action D: 