In [54]:
import html
import random

import gymnasium as gym
import numpy as np
from IPython.display import display, HTML


In [55]:
# Build the Frozen Lake environment.
# With is_slippery=True the ice is slippery and actions are stochastic: the
# agent may slip in a random direction instead of the intended one. This adds
# uncertainty to the environment, makes learning more challenging, and the
# success rate may be lower than when the agent can be fully controlled
# (is_slippery=False).
env = gym.make("FrozenLake-v1", render_mode="ansi", is_slippery=True)

# Sizes of the discrete state and action spaces; used to shape the Q-table.
n_states, n_actions = env.observation_space.n, env.action_space.n

print(
    f"Environment created: FrozenLake-v1",
    f"Number of states: {n_states}",
    f"Number of actions: {n_actions}",
    sep="\n",
)


Environment created: FrozenLake-v1
Number of states: 16
Number of actions: 4


In [56]:
# Initializing Q-table and hyperparameters
# NOTE: the training cell re-declares every name below before the training loop
# runs, so the values here are kept identical to the ones used there to avoid
# two conflicting configurations in the same notebook.
Q = np.zeros((n_states, n_actions))  # one row per state, one column per action

alpha = 0.9            # Learning rate
gamma = 0.95           # Discount factor
epsilon = 1.0          # Initial exploration rate
epsilon_min = 0.01     # Lower bound for the exploration rate
epsilon_decay = 0.999  # Multiplicative decay applied to epsilon after each episode
episodes = 10000       # Number of training episodes
max_steps = 100        # Step cap per training episode



In [57]:
# Training the agent using Q-learning 
import random
from IPython.display import HTML, display

def print_scrollable(text, height=300):
    """Display long log text in a scrollable, monospace HTML box.

    Args:
        text: Content to show. It is converted to ``str`` and HTML-escaped, so
            characters such as ``<``, ``>`` and ``&`` are rendered literally
            instead of being interpreted as markup.
        height: Maximum height of the box in pixels; longer content scrolls.
    """
    # Escape before interpolating: the text is log output, not trusted markup.
    safe_text = html.escape(str(text))
    display(HTML(f"<div style='max-height:{height}px; overflow-y:auto; border:1px solid #ccc; "
                 f"padding:10px; font-family:monospace; white-space:pre;'>{safe_text}</div>"))

# Hyperparameters
alpha = 0.9            # Learning rate
gamma = 0.95           # Discount factor
epsilon = 1.0          # Exploration rate (starts fully random)
epsilon_min = 0.01     # Lower bound for the exploration rate
epsilon_decay = 0.999  # Multiplicative decay applied after every episode
episodes = 10000
max_steps = 100
SEED = 42              # Fixed seed so Restart & Run All reproduces the same training run

# Seed every source of randomness the training loop draws from.
random.seed(SEED)            # epsilon-greedy coin flips below
env.action_space.seed(SEED)  # env.action_space.sample()

# Initialize Q-table
Q = np.zeros((env.observation_space.n, env.action_space.n))

log_output = ""

for episode in range(episodes):
    # Seed the environment's RNG on the first reset only; later resets pass
    # seed=None so the same random stream simply continues.
    state, _ = env.reset(seed=SEED if episode == 0 else None)
    done = False

    for step in range(max_steps):
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])

        # Take action
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-learning update rule.
        # A terminated episode has no future return, so bootstrap from the next
        # state only when the episode continues (or was merely truncated).
        future_value = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + gamma * future_value
        Q[state, action] += alpha * (td_target - Q[state, action])

        state = next_state
        if done:
            break

    # Decay exploration rate
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Log progress
    if (episode + 1) % 1000 == 0:
        log_output += f"Episode {episode+1}/{episodes} — Epsilon: {epsilon:.3f}\n"

print_scrollable(log_output)


In [58]:
# Evaluating the trained agent
import re

def print_scrollable(text, height=400):
    """Display text in a scrollable, monospace HTML output box.

    Args:
        text: Content to show. It is converted to ``str`` and HTML-escaped, so
            characters such as ``<``, ``>`` and ``&`` are rendered literally
            instead of being interpreted as markup.
        height: Maximum height of the box in pixels; longer content scrolls.
    """
    # Escape before interpolating: the text is log output, not trusted markup.
    safe_text = html.escape(str(text))
    display(HTML(
        f"<div style='max-height:{height}px; overflow-y:auto; border:1px solid #ccc; "
        f"padding:10px; font-family:monospace; white-space:pre;'>{safe_text}</div>"
    ))

def remove_ansi(text):
    """Return ``text`` with ANSI escape sequences (e.g. color codes) stripped."""
    return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)

# Evaluation loop: run the greedy policy and collect statistics plus a text log
episodes_eval = 20
total_rewards = 0
total_steps = 0
successes = 0
log_parts = []  # rendered frames and per-episode headers, joined once at the end

for episode in range(episodes_eval):
    state, _ = env.reset()
    step_count = 0
    log_parts.append(f"Episode {episode + 1}\n")

    while True:
        # Greedy policy (no exploration)
        action = np.argmax(Q[state])
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        log_parts.append(env.render())

        state = next_state
        step_count += 1
        total_rewards += reward

        if not done:
            continue

        # Episode finished: a final reward of 1 is counted as a success
        if reward == 1:
            successes += 1
        log_parts.append(f"Reward: {reward}\n{'=' * 40}\n")
        break

    total_steps += step_count

# Strip color codes from the rendered frames before showing them as HTML
ansi_logs = remove_ansi("".join(log_parts))
print_scrollable(ansi_logs, height=400)

In [59]:
# Summary of the evaluation run: success rate, mean episode length, mean reward
print(
    "Evaluation Results",
    f"Success rate: {successes / episodes_eval * 100:.2f}%",
    f"Average steps per episode: {total_steps / episodes_eval:.2f}",
    f"Average reward per episode: {total_rewards / episodes_eval:.2f}",
    sep="\n",
)

Evaluation Results
Success rate: 60.00%
Average steps per episode: 25.55
Average reward per episode: 0.60


### **Conclusion**

- When using **`is_slippery=False`**, the environment becomes **deterministic** — every action leads exactly where the agent intends to go.  
  🔹 As a result, the Q-learning agent easily learns the **optimal policy**, achieving a **success rate of 100%** after sufficient training.  

- When using **`is_slippery=True`**, the environment introduces **stochasticity (random slips)**.  
  🔹 The agent’s movements are no longer perfectly predictable, making the learning process harder.  
  🔹 Consequently, even a well-trained agent typically achieves a **success rate between 50% and 70%**, depending on hyperparameters and random outcomes.  

**In summary:**  
> Deterministic environments (`is_slippery=False`) allow perfect control and performance,  
> while stochastic environments (`is_slippery=True`) better simulate real-world uncertainty —  
> leading to more realistic but less consistent results.
