<a href="https://colab.research.google.com/github/KijoSal-dev/RL-example/blob/main/example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import the gym library for the environment and numpy for numerical operations.
import gym
import numpy as np

# Create the FrozenLake environment.
# 'is_slippery=False' makes the transitions deterministic (simpler for demonstration).
# 'new_step_api=True' requests the 5-tuple step() result
# (observation, reward, terminated, truncated, info) that the loops below unpack.
# NOTE(review): this keyword looks specific to certain gym releases — confirm it is
# accepted by the gym version installed in the target runtime.
env = gym.make("FrozenLake-v1", is_slippery=False, new_step_api=True)

# Seed every source of randomness used by this notebook so that
# "Restart Kernel -> Run All" reproduces the same Q-table:
#   * np.random.uniform(...) drives the explore/exploit coin flip,
#   * env.action_space.sample() draws the random exploratory actions.
SEED = 42
np.random.seed(SEED)
env.action_space.seed(SEED)

# Initialize the Q-table with zeros.
# Rows correspond to states and columns to possible actions.
Q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Set hyperparameters for the Q-learning algorithm.
n_episodes = 2000       # Total episodes to run the training.
max_steps = 100         # Maximum steps per episode.
learning_rate = 0.8     # Learning rate to control update magnitude.
discount_factor = 0.95  # Discount factor for future rewards.
max_epsilon = 1.0       # Maximum exploration probability.
min_epsilon = 0.01      # Minimum exploration probability.
decay_rate = 0.005      # Exponential decay rate for epsilon.
epsilon = max_epsilon   # Exploration rate; starts fully exploratory and decays per episode.

# Q-learning algorithm:
# For every episode, act epsilon-greedily, apply the one-step Q-learning update
# after each transition, then decay epsilon before the next episode.
for episode in range(n_episodes):
    # Start a new episode. Depending on the gym version, reset() returns either
    # the observation alone or an (observation, info) tuple; handle both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    for step in range(max_steps):
        # Generate a random number to decide whether to explore or exploit.
        exp_exp_tradeoff = np.random.uniform(0, 1)

        # Epsilon-greedy strategy:
        # If the random number is greater than epsilon, we exploit (choose best action).
        # Otherwise, we explore by selecting a random action.
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(Q_table[state, :])  # Best known action.
        else:
            action = env.action_space.sample()      # Random action.

        # Take the chosen action and observe the transition.
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Value of the best action in the next state. A *terminated* transition has
        # no future, so its bootstrap value is 0. A merely *truncated* episode
        # (time limit) still bootstraps, because the next state is not terminal.
        future_value = 0.0 if terminated else np.max(Q_table[new_state, :])

        # Q-learning update rule:
        # Q(s, a) <- Q(s, a) + learning_rate * [reward + discount * future_value - Q(s, a)]
        td_target = reward + discount_factor * future_value
        Q_table[state, action] += learning_rate * (td_target - Q_table[state, action])

        # Transition to the new state.
        state = new_state

        # If the episode is finished (terminal state or time limit), exit the loop.
        if done:
            break

    # Decay epsilon after each episode.
    # This gradually reduces exploration as the agent learns more about the environment.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

# Report the learned action values: a header line followed by the Q-table itself.
print("Trained Q-table:", Q_table, sep="\n")

# --- Testing the trained agent ---
# Run a single greedy rollout (no exploration) and report the reward it collects.

# reset() may return the observation alone or an (observation, info) tuple.
reset_result = env.reset()
if isinstance(reset_result, tuple):
    state = reset_result[0]
    info = reset_result[1] if len(reset_result) > 1 else {}
else:
    state, info = reset_result, {}

total_reward = 0     # Reward accumulated over the rollout.
for step in range(max_steps):
    # Pure exploitation: pick the highest-valued action for the current state.
    action = np.argmax(Q_table[state])
    new_state, reward, terminated, truncated, info = env.step(action)

    total_reward = total_reward + reward  # Accumulate the reward.
    state = new_state                     # Move to the next state.

    # Stop once the episode has ended, either terminally or by time limit.
    done = terminated or truncated
    if done:
        break

print("Total reward earned during test:", total_reward)

  if not isinstance(terminated, (bool, np.bool8)):


Trained Q-table:
[[0.73509189 0.77378094 0.69819565 0.73509189]
 [0.7350869  0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.81450617 0.         0.857375   0.77378093]
 [0.81450625 0.9025     0.81450625 0.        ]
 [0.857375   0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.90249954 0.94998784 1.         0.81450625]
 [0.         0.         0.         0.        ]]
Total reward earned during test: 1.0


In [4]:
# Pin NumPy to 1.24.0 in the kernel's environment (%pip targets the running kernel).
# NOTE(review): presumably pinned because the installed gym still references `np.bool8`
# (see the warning fragment in the previous cell's output) — confirm.
# NOTE(review): this cell comes after the code that already imported numpy; the pin likely
# only takes effect after a kernel restart, so it probably belongs in a top cell — confirm.
%pip install numpy==1.24.0

