<a href="https://colab.research.google.com/github/Mariihmp/RL_Notebooks/blob/main/Q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive so that files under /content/drive
# become readable from this runtime.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import numpy as np
import random
# Drive folder added to the import search path (presumably where the
# project-local modules imported later in the notebook live — confirm).
project_folder_path = '/content/drive/MyDrive/session-3'
# Only append once, so re-running this cell in the same kernel does not pile
# up duplicate sys.path entries.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)



In [None]:
# GridWorld_env is a project-local module — presumably found through the Drive
# folder appended to sys.path earlier in the notebook; confirm if the import fails.
from GridWorld_env import GridWorld
# Single environment instance shared by the Q-learning cells below.
env = GridWorld()

In [None]:

def q_learning(env, num_episodes, alpha=0.1, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay_rate=0.999,
               max_steps_per_episode=100, seed=None):
    """Run tabular Q-learning with an epsilon-greedy behaviour policy.

    The environment is used through the following members only:
    ``num_states``, ``num_actions``, ``gamma``, ``start_state``, ``goal_state``,
    ``state_to_idx``, ``idx_to_state``, ``reset()`` (returns a state index) and
    ``step(state_idx, action_idx)`` (returns ``(next_state_idx, reward, done)``).

    Args:
        env: Environment object exposing the members listed above.
        num_episodes: Number of training episodes.
        alpha: Learning rate of the TD update.
        epsilon_start: Exploration probability used in the first episode.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay_rate: Multiplicative decay applied to epsilon after
            every episode.
        max_steps_per_episode: Maximum number of transitions per episode
            before the episode is cut off.
        seed: Optional seed. When given, all random draws come from private
            generators seeded with it, so the run is repeatable and global
            RNG state is untouched. When ``None`` the module-level ``random``
            and ``np.random`` generators are used.

    Returns:
        Tuple ``(policy, V, Q)`` where ``policy`` has shape
        ``(num_states, num_actions)`` and is one-hot on the greedy action
        (uniform in the goal state), ``V`` is the row-wise maximum of ``Q``
        with the goal entry set to 0, and ``Q`` is the learned action-value
        table of shape ``(num_states, num_actions)``.
    """
    # Two handles because exploration uses `random` and greedy tie-breaking
    # uses NumPy; keeping both preserves the unseeded random stream exactly.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    # The goal row stays at zero for the whole run: episodes break out before
    # acting from the goal state, so that row is never updated.
    Q = np.zeros((env.num_states, env.num_actions))

    goal_s_idx = env.state_to_idx[env.goal_state]
    start_s_idx = env.state_to_idx[env.start_state]
    # Report progress roughly 20 times per run (every episode for short runs).
    report_every = num_episodes // 20 if num_episodes >= 20 else 1

    epsilon = epsilon_start

    print(f"\nStarting Q-learning for {num_episodes} episodes")
    print(f"Alpha={alpha}, Epsilon_start={epsilon_start}, Epsilon_end={epsilon_end}, Epsilon_decay={epsilon_decay_rate}")

    for i_episode in range(num_episodes):
        if (i_episode + 1) % report_every == 0:
            current_v_for_start = np.max(Q[start_s_idx]) if start_s_idx < Q.shape[0] else 0
            print(f"Episode {i_episode + 1}/{num_episodes}, Epsilon: {epsilon:.3f}, Q(start, max_a): {current_v_for_start:.2f}")

        state_idx = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            if env.idx_to_state[state_idx] == env.goal_state:
                break

            # Epsilon-greedy action selection; greedy ties are broken uniformly
            # at random so an all-zero row does not always pick action 0.
            if py_rng.uniform(0, 1) < epsilon:
                action_idx = py_rng.choice(range(env.num_actions))
            else:
                best_actions = np.flatnonzero(Q[state_idx] == np.max(Q[state_idx]))
                action_idx = int(np_rng.choice(best_actions))

            next_state_idx, reward, done = env.step(state_idx, action_idx)

            # Q-learning update: terminal transitions do not bootstrap.
            if done:
                td_target = reward
            else:
                td_target = reward + env.gamma * np.max(Q[next_state_idx])
            Q[state_idx, action_idx] += alpha * (td_target - Q[state_idx, action_idx])

            state_idx = next_state_idx
            steps += 1

        epsilon = max(epsilon_end, epsilon * epsilon_decay_rate)

    # Greedy policy from the learned Q-table (np.argmax picks the first
    # maximiser on ties); the goal state gets a uniform distribution.
    optimal_policy_qlearning = np.zeros((env.num_states, env.num_actions))
    optimal_policy_qlearning[np.arange(env.num_states), np.argmax(Q, axis=1)] = 1.0
    optimal_policy_qlearning[goal_s_idx, :] = 1.0 / env.num_actions

    V_optimal_qlearning = np.max(Q, axis=1)
    V_optimal_qlearning[goal_s_idx] = 0

    return optimal_policy_qlearning, V_optimal_qlearning, Q


In [None]:
# --- Q-learning hyperparameters ---
num_q_episodes = 5000
alpha_q = 0.1  # Learning rate

# Epsilon-greedy exploration parameters
epsilon_start_q = 1.0
epsilon_end_q = 0.05
epsilon_decay_q = 0.999  # Decays epsilon over episodes

# q_learning draws from both `random` and `np.random`; seed both so that
# Restart & Run All reproduces the same training run.
seed_q = 0
random.seed(seed_q)
np.random.seed(seed_q)

optimal_policy_q, V_optimal_q, Q_star_q = q_learning(env,
                                                      num_q_episodes,
                                                      alpha=alpha_q,
                                                      epsilon_start=epsilon_start_q,
                                                      epsilon_end=epsilon_end_q,
                                                      epsilon_decay_rate=epsilon_decay_q)

print("\nResults from Q-learning")
print("Optimal Policy (Q-learning):")
for s_idx in range(env.num_states):
    # Greedy actions carry probability exactly 1.0; the goal row is uniform,
    # so its list of optimal actions is empty.
    best_actions_q = [env.actions[i] for i, p in enumerate(optimal_policy_q[s_idx]) if p == 1.0]
    print(f"  State {env.idx_to_state[s_idx]}: Optimal Action(s) = {best_actions_q}")

print("\nOptimal V(s) (Q-learning, from Q*):")
for s_idx, v_val in enumerate(V_optimal_q):
    print(f"  State {env.idx_to_state[s_idx]}: {v_val:.2f}")

print("\nOptimal Q*(s,a) (Q-learning):")
for s_idx in range(env.num_states):
    if env.idx_to_state[s_idx] == env.goal_state:
        print(f"  State {env.idx_to_state[s_idx]} (Goal): Q values are 0.00")
        continue
    print(f"  State {env.idx_to_state[s_idx]}:")
    for a_idx, q_val in enumerate(Q_star_q[s_idx]):
        print(f"    Action {env.actions[a_idx]}: {q_val:.2f}")

print("\nFor comparison, Q*(s,a) from Value Iteration (should be similar):")
# Q_optimal_vi is not defined in the cells shown here; look it up instead of
# printing empty per-state headers when it is missing.
q_values_vi = globals().get("Q_optimal_vi")
if q_values_vi is None:
    print("  Q_optimal_vi is not defined; run value iteration first to see this comparison.")
else:
    for s_idx in range(env.num_states):
        if env.idx_to_state[s_idx] == env.goal_state:
            print(f"  State {env.idx_to_state[s_idx]} (Goal): Q values are 0.00")
            continue
        print(f"  State {env.idx_to_state[s_idx]}:")
        for a_idx, q_val in enumerate(q_values_vi[s_idx]):
            print(f"    Action {env.actions[a_idx]}: {q_val:.2f}")


Starting Q-learning for 5000 episodes
Alpha=0.1, Epsilon_start=1.0, Epsilon_end=0.05, Epsilon_decay=0.999
Episode 250/5000, Epsilon: 0.779, Q(start, max_a): 8.00
Episode 500/5000, Epsilon: 0.607, Q(start, max_a): 8.00
Episode 750/5000, Epsilon: 0.473, Q(start, max_a): 8.00
Episode 1000/5000, Epsilon: 0.368, Q(start, max_a): 8.00
Episode 1250/5000, Epsilon: 0.287, Q(start, max_a): 8.00
Episode 1500/5000, Epsilon: 0.223, Q(start, max_a): 8.00
Episode 1750/5000, Epsilon: 0.174, Q(start, max_a): 8.00
Episode 2000/5000, Epsilon: 0.135, Q(start, max_a): 8.00
Episode 2250/5000, Epsilon: 0.105, Q(start, max_a): 8.00
Episode 2500/5000, Epsilon: 0.082, Q(start, max_a): 8.00
Episode 2750/5000, Epsilon: 0.064, Q(start, max_a): 8.00
Episode 3000/5000, Epsilon: 0.050, Q(start, max_a): 8.00
Episode 3250/5000, Epsilon: 0.050, Q(start, max_a): 8.00
Episode 3500/5000, Epsilon: 0.050, Q(start, max_a): 8.00
Episode 3750/5000, Epsilon: 0.050, Q(start, max_a): 8.00
Episode 4000/5000, Epsilon: 0.050, Q(star