<a href="https://colab.research.google.com/github/MuleHakim/Reinforcement-Learning/blob/main/Policy_Iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Policy Iteration

### Grid World (FrozenLake Environment)

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m593.9/953.9 kB[0m [31m18.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np

In [3]:
# Initialize the FrozenLake environment.
# is_slippery=False is passed to turn off the slippery-ice dynamics, which
# should make every move deterministic (see the Gymnasium FrozenLake docs).
env = gym.make('FrozenLake-v1', is_slippery=False)

In [4]:
def policy_evaluation(policy, env, gamma=0.99, theta=1e-8):
    """Iteratively compute the state values of a fixed deterministic policy.

    Each sweep replaces every state's value with the expected one-step return
    of the action the policy prescribes there. Updates are applied in place,
    so later states in a sweep already see the refreshed values of earlier
    ones. Sweeps repeat until the largest change within a sweep is below
    ``theta``.

    Args:
        policy: Sequence indexed by state giving the action taken in it.
        env: Environment exposing ``observation_space.n`` and a tabular model
            ``P[state][action]`` that yields
            ``(prob, next_state, reward, done)`` tuples. The model is read
            from ``env.unwrapped`` when that attribute exists, otherwise from
            ``env`` itself.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold on the per-sweep maximum value change.

    Returns:
        numpy.ndarray of length ``env.observation_space.n`` holding the
        estimated value of every state under ``policy``.
    """
    # gym.make() returns the environment wrapped in helper wrappers. Reaching
    # `P` through them (`env.P`) is deprecated in Gymnasium 0.29 and announced
    # for removal in v1.0, so read the model from the base environment.
    transitions = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n

    # Initialize the value table with zeros
    value_table = np.zeros(n_states)

    while True:
        delta = 0
        # Iterate over all states in the environment
        for state in range(n_states):
            action = policy[state]
            # Expected one-step return of following the policy from `state`
            value = sum(prob * (reward + gamma * value_table[next_state])
                        for prob, next_state, reward, _ in transitions[state][action])
            # Track the largest change of this sweep to check convergence
            delta = max(delta, np.abs(value - value_table[state]))
            # Update the value table in place
            value_table[state] = value

        # Stop once no state changed by theta or more during a full sweep
        if delta < theta:
            break

    return value_table


In [5]:
def policy_iteration(env, gamma=0.99, theta=1e-8):
    """Find a greedy-stable policy by alternating evaluation and improvement.

    Starts from a policy drawn uniformly at random with ``np.random.choice``
    (global NumPy random state). Each round evaluates the current policy with
    ``policy_evaluation`` and then makes the policy greedy with respect to
    the resulting values. The loop ends after the first improvement sweep
    that changes no action.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a tabular model ``P[state][action]`` that yields
            ``(prob, next_state, reward, done)`` tuples. The model is read
            from ``env.unwrapped`` when that attribute exists, otherwise from
            ``env`` itself.
        gamma: Discount factor, also forwarded to ``policy_evaluation``.
        theta: Convergence threshold forwarded to ``policy_evaluation``.

    Returns:
        Tuple ``(policy, value_table)``: the final action per state and the
        state values computed for it by the last evaluation.
    """
    # gym.make() returns the environment wrapped in helper wrappers. Reaching
    # `P` through them (`env.P`) is deprecated in Gymnasium 0.29 and announced
    # for removal in v1.0, so read the model from the base environment.
    transitions = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Initialize a random policy: one action index per state
    policy = np.random.choice(n_actions, n_states)

    while True:
        # Evaluate the current policy
        value_table = policy_evaluation(policy, env, gamma, theta)
        policy_stable = True

        # Iterate over all states to improve the policy
        for state in range(n_states):
            old_action = policy[state]
            # One-step look-ahead value of every action in this state
            Q_values = [sum(prob * (reward + gamma * value_table[next_state])
                            for prob, next_state, reward, _ in transitions[state][action])
                        for action in range(n_actions)]
            # Choose the action with the highest Q-value; np.argmax returns
            # the lowest index on ties, which keeps the choice deterministic
            new_action = np.argmax(Q_values)
            if old_action != new_action:
                policy_stable = False
            policy[state] = new_action

        # If no action changed, the policy is greedy w.r.t. its own values
        if policy_stable:
            break

    return policy, value_table

In [6]:
# Run policy iteration on the FrozenLake environment with the default
# discount factor (gamma=0.99) and convergence threshold (theta=1e-8).
policy, value_table = policy_iteration(env)

  logger.warn(


In [7]:
# Show the action index selected for each state (one entry per state).
print("Optimal Policy:", policy)

Optimal Policy: [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]


In [8]:
# Show the state values computed for the final policy (one entry per state).
print("Value Table:", value_table)

Value Table: [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]


### Single-State Multi-Armed Bandit


In [9]:
def policy_iteration_bandit(k, gamma=0.99, theta=1e-8, max_steps=1000, rewards=None):
    """Run policy iteration on a single-state, k-armed bandit with fixed rewards.

    The policy is a single arm index. Evaluating it yields, for every arm
    ``a``, the value of pulling ``a`` once and following the policy afterwards:
    ``Q(a) = rewards[a] + gamma * Q(policy)``. Improvement then switches to
    the arm with the highest such value. The loop ends when improvement
    leaves the arm unchanged.

    Args:
        k: Number of arms; must be at least 1.
        gamma: Discount factor applied to the value of continuing.
        theta: Evaluation stops early once the largest change in a sweep is
            below this threshold.
        max_steps: Upper bound on evaluation sweeps per policy. If it is
            reached before ``theta`` is met, the returned values are the
            truncated estimate.
        rewards: Optional length-``k`` sequence with the reward of each arm.
            When omitted, rewards are sampled from a standard normal
            distribution using the global NumPy random state.

    Returns:
        Tuple ``(policy, value_table)``: the selected arm index and a
        numpy.ndarray of length ``k`` with the evaluated value of every arm.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``rewards`` does not have
            shape ``(k,)``.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Draw the initial arm before the rewards so that, for a given global
    # NumPy seed, the sampled problem instance stays the same as before.
    policy = np.random.choice(k)
    if rewards is None:
        rewards = np.random.randn(k)  # Assume normal distribution with unit variance
    else:
        rewards = np.asarray(rewards, dtype=float)
        if rewards.shape != (k,):
            raise ValueError("rewards must contain exactly k entries")
    value_table = np.zeros(k)

    while True:
        old_policy = policy

        # Policy evaluation: bootstrap from the arm the *current policy*
        # pulls. Bootstrapping from the max over arms would be value
        # iteration and would ignore the policy entirely.
        for _ in range(max_steps):
            new_value_table = rewards + gamma * value_table[policy]
            delta = np.max(np.abs(new_value_table - value_table))
            value_table = new_value_table
            if delta < theta:
                break

        # Policy improvement: act greedily w.r.t. the evaluated values
        policy = np.argmax(value_table)

        if policy == old_policy:
            break

    return policy, value_table

In [10]:
k = 10  # Number of arms of the bandit (one reward is drawn per arm)

In [11]:
# Solve the k-armed bandit with the default settings.
# NOTE: this rebinds `policy` and `value_table`, replacing the FrozenLake
# results assigned to the same names in the earlier section.
policy, value_table = policy_iteration_bandit(k)

In [12]:
# Show the index of the arm the final policy pulls.
print("Optimal Policy for Bandit:", policy)

Optimal Policy for Bandit: 7


In [13]:
# Show the value computed for each arm (one entry per arm).
print("Value Table for Bandit:", value_table)

Value Table for Bandit: [210.18299157 209.98884411 210.33777636 209.85741078 209.17872026
 207.43406794 211.05546753 211.99305777 208.93840636 210.16512246]
