<a href="https://colab.research.google.com/github/MuleHakim/Reinforcement-Learning/blob/main/Epsilon_Greedy_Policy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4. Epsilon-Greedy Policy

### Grid World (FrozenLake Environment)


In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np
import random

In [3]:
# Initialize the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

In [4]:
# Initialize the Q-table with zeros
q_table = np.zeros((env.observation_space.n, env.action_space.n))
# q_table = np.random.rand(env.observation_space.n, env.action_space.n)
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.5  # Exploration rate
num_episodes = 50000
max_steps = 100

In [5]:
# Training the agent using epsilon-greedy policy
for episode in range(num_episodes):
    state = env.reset()[0]
    done = False
    for step in range(max_steps):
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, done, _, _ = env.step(action)

        q_table[state, action] = q_table[state, action] + alpha * (
                    reward + gamma * np.max(q_table[new_state, :]) - q_table[state, action])

        state = new_state

        if done:
            break

In [6]:
print("Q-Table using Epsilon-Greedy:", q_table)


Q-Table using Epsilon-Greedy: [[0.94148015 0.95099005 0.95099005 0.94148015]
 [0.94148015 0.         0.96059601 0.95099005]
 [0.95099005 0.970299   0.95099005 0.96059601]
 [0.96059601 0.         0.95099005 0.95099005]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.         0.96059601]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.970299   0.95099005]
 [0.96059601 0.9801     0.9801     0.        ]
 [0.970299   0.99       0.         0.970299  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.99       0.970299  ]
 [0.9801     0.99       1.         0.9801    ]
 [0.         0.         0.         0.        ]]


### Single-State Multi-Armed Bandit


In [7]:
k = 10  # Number of arms
q_table = np.zeros(k)
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.1  # Exploration rate
num_steps = 1000

In [8]:
# Simulate the epsilon-greedy policy
for step in range(num_steps):
    if random.uniform(0, 1) < epsilon:
        action = random.randint(0, k-1)
    else:
        action = np.argmax(q_table)

    reward = np.random.normal(action, 1.0)  # Assume normal distribution with unit variance

    q_table[action] = q_table[action] + alpha * (reward + gamma * np.max(q_table) - q_table[action])

In [9]:
print("Q-Table using Epsilon-Greedy for Bandit:", q_table)


Q-Table using Epsilon-Greedy for Bandit: [ 83.40575822  89.14516235  94.61767988  84.43182954 236.50320981
 107.77731379 117.39320702 135.7021066   55.74063539 118.81586401]
