<a href="https://colab.research.google.com/github/MuleHakim/Reinforcement-Learning/blob/main/Q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 3. Q-Learning


### Grid World (FrozenLake Environment)


In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np
import random

In [3]:
# Initialize the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

In [4]:
# Initialize the Q-table with zeros
q_table = np.zeros((env.observation_space.n, env.action_space.n))
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.001
num_episodes = 10000
max_steps = 100

In [5]:
# Training the agent
for episode in range(num_episodes):
    state = env.reset()[0]  # Ensure correct state reset
    done = False
    for step in range(max_steps):
        # Exploration-exploitation trade-off
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        # Take the action and observe the outcome
        new_state, reward, done, _, _ = env.step(action)

        # Handle the sparse reward structure
        if done and reward == 0:
            reward = -1  # Penalize falling into a hole or reaching max_steps without success

        # Update the Q-value
        q_table[state, action] = q_table[state, action] + alpha * (
                    reward + gamma * np.max(q_table[new_state, :]) - q_table[state, action])

        state = new_state

        if done:
            break

    # Decay the exploration rate
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [6]:
print("Q-Table:", q_table)

Q-Table: [[ 0.94148015  0.95099005  0.93206534  0.94148015]
 [ 0.94148015 -0.99995002  0.71378428  0.88704532]
 [ 0.35949622  0.92401454  0.05190564  0.2621459 ]
 [ 0.18233602 -0.83322818  0.00364195  0.01244775]
 [ 0.95099005  0.96059601 -1.          0.94148015]
 [ 0.          0.          0.          0.        ]
 [-0.96184796  0.9800992  -0.95289871  0.58309734]
 [ 0.          0.          0.          0.        ]
 [ 0.96059601 -1.          0.970299    0.95099005]
 [ 0.960596    0.98009999  0.9801     -1.        ]
 [ 0.97029885  0.99       -1.          0.97028972]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.94766524  0.9616406   0.99        0.94511384]
 [ 0.98009991  0.98999982  1.          0.98009997]
 [ 0.          0.          0.          0.        ]]


In [7]:
# Extracting the optimal policy from the Q-table
optimal_policy = np.argmax(q_table, axis=1)
actions = ['Left', 'Down', 'Right', 'Up']
optimal_policy_named = [actions[action] for action in optimal_policy]

for state in range(env.observation_space.n):
    print(f"State {state}: {optimal_policy_named[state]}")

State 0: Down
State 1: Left
State 2: Down
State 3: Left
State 4: Down
State 5: Left
State 6: Down
State 7: Left
State 8: Right
State 9: Right
State 10: Down
State 11: Left
State 12: Left
State 13: Right
State 14: Right
State 15: Left


### Single-State Multi-Armed Bandit


In [8]:
k = 10  # Number of arms
q_table = np.zeros(k)
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
num_steps = 1000

In [9]:
# Simulate the Q-learning process
for step in range(num_steps):
    if random.uniform(0, 1) < epsilon:
        action = random.randint(0, k-1)
    else:
        action = np.argmax(q_table)

    reward = np.random.normal(action, 1.0)  # Assume normal distribution with unit variance

    # Update the Q-value
    q_table[action] = q_table[action] + alpha * (reward + gamma * np.max(q_table) - q_table[action])

    # Decay the exploration rate
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * step)


In [10]:
print("Q-Table for Bandit:", q_table)

Q-Table for Bandit: [113.17730386  92.32205897  70.63366647  74.7474042   29.24821495
  27.54689897  13.49417317   9.72042205 474.91141384   9.46294214]


In [11]:
# Identify the best action
best_action = np.argmax(q_table)
print("Best Action:", best_action, "with Q-Value:", q_table[best_action])


Best Action: 8 with Q-Value: 474.91141383512905


In [12]:
# Print all actions and their Q-values
for action in range(k):
    print(f"Action {action}: Q-Value = {q_table[action]}")

Action 0: Q-Value = 113.17730385990268
Action 1: Q-Value = 92.32205896740714
Action 2: Q-Value = 70.63366647128467
Action 3: Q-Value = 74.74740419900677
Action 4: Q-Value = 29.24821495147941
Action 5: Q-Value = 27.546898967946362
Action 6: Q-Value = 13.494173171899911
Action 7: Q-Value = 9.720422047512386
Action 8: Q-Value = 474.91141383512905
Action 9: Q-Value = 9.462942139934263
