<a href="https://colab.research.google.com/github/MuleHakim/Reinforcement-Learning/blob/main/Value_Iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Value Iteration

###  Grid World (FrozenLake Environment)

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np


In [3]:
env = gym.make('FrozenLake-v1', is_slippery=False)

In [4]:
def value_iteration(env, gamma=0.99, theta=1e-8):
    value_table = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for state in range(env.observation_space.n):
            Q_values = [sum([prob * (reward + gamma * value_table[next_state])
                             for prob, next_state, reward, _ in env.P[state][action]])
                        for action in range(env.action_space.n)]
            max_Q_value = max(Q_values)
            delta = max(delta, np.abs(max_Q_value - value_table[state]))
            value_table[state] = max_Q_value
        if delta < theta:
            break
    policy = np.zeros(env.observation_space.n, dtype=int)
    for state in range(env.observation_space.n):
        Q_values = [sum([prob * (reward + gamma * value_table[next_state])
                         for prob, next_state, reward, _ in env.P[state][action]])
                    for action in range(env.action_space.n)]
        policy[state] = np.argmax(Q_values)
    return policy, value_table

In [5]:
policy, value_table = value_iteration(env)

  logger.warn(


In [6]:
print("Optimal Policy:", policy)

Optimal Policy: [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]


In [7]:
print("Value Table:", value_table)

Value Table: [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]


### Single-State Multi-Armed Bandit

In [8]:
def value_iteration_bandit(k, gamma=0.99, theta=1e-8, max_steps=1000):
    # Initialize value table for each arm
    value_table = np.zeros(k)
    rewards = np.random.randn(k)  # Assume normal distribution with unit variance
    for step in range(max_steps):
        for action in range(k):
            value_table[action] = (1 / (step + 1)) * (rewards[action] + gamma * max(value_table))
    return value_table


In [9]:
k = 10  # Number of arms

In [10]:
value_table = value_iteration_bandit(k)

In [11]:
print("Value Table for Bandit:", value_table)

Value Table for Bandit: [-0.00038309 -0.00063805  0.00087915 -0.00021576  0.00019603 -0.00052959
  0.0009182   0.00032485 -0.00090633 -0.00233692]
