In [2]:
!pip install gymnasium[all]

Collecting box2d-py==2.3.5 (from gymnasium[all])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[all])
  Using cached swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Collecting mujoco-py<2.2,>=2.1 (from gymnasium[all])
  Using cached mujoco_py-2.1.2.14-py3-none-any.whl.metadata (669 bytes)
Collecting cython<3 (from gymnasium[all])
  Using cached Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Collecting mujoco>=2.1.5 (from gymnasium[all])
  Using cached mujoco-3.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
Collecting glfw (from mujoco>=2.1.5->gymnasium[all])
  Using cached glfw-2.9.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting fasteners~=0.15 (from mujoco-py<2.2,>=2.1

In [3]:
import gymnasium as gym
import numpy as np

In [4]:
# Build the FrozenLake environment (4x4 grid) with slippery tiles enabled,
# then read the sizes of its discrete observation and action spaces.
env = gym.make("FrozenLake-v1", is_slippery=True)
n_states, n_actions = env.observation_space.n, env.action_space.n

print(f"Number of states: {n_states}")
print(f"Number of actions: {n_actions}")

Number of states: 16
Number of actions: 4


In [5]:
# Q-learning hyperparameters
lr = 0.8         # learning rate: weight given to each temporal-difference correction
gamma = 0.95     # discount factor applied to the best value of the next state
episodes = 2000  # number of training episodes

# Action-value table: one row per state, one column per action, all zeros initially.
Q = np.zeros(shape=(n_states, n_actions))


In [6]:
# Train the Q-table with tabular Q-learning. Exploration comes from Gaussian
# noise added to the Q-values, scaled by 1 / (episode + 1) so it decays over time.
# Both sources of randomness (exploration noise and the environment) are seeded
# so that a fresh Restart & Run All reproduces the same Q-table.
SEED = 42
rng = np.random.default_rng(SEED)

for episode in range(episodes):
    # Seed the environment's RNG on the first reset only; passing seed=None on
    # later resets leaves that seeded stream running instead of restarting it.
    state, _ = env.reset(seed=SEED if episode == 0 else None)
    done = False
    while not done:
        # Pick the arg-max of the noise-perturbed Q-values for the current state.
        # The noise is 1-D (one sample per action) so it lines up with Q[state].
        noise = rng.standard_normal(n_actions) / (episode + 1)
        action = int(np.argmax(Q[state] + noise))
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Move Q[state, action] towards the TD target
        # reward + gamma * max_a Q[new_state, a] by a fraction lr.
        Q[state, action] += lr * (reward + gamma * np.max(Q[new_state]) - Q[state, action])
        state = new_state


In [7]:
# Show the learned action-value table under a heading line.
print("Trained Q-table:", Q, sep="\n")


Trained Q-table:
[[1.03908686e-01 8.32207428e-03 7.07515848e-03 6.23802887e-03]
 [7.79828672e-04 5.16035425e-05 2.84288668e-04 1.03228989e-01]
 [7.40731367e-03 3.90385060e-03 3.11017979e-03 1.63147112e-01]
 [8.50533791e-05 3.21756543e-03 7.36179746e-04 4.94965380e-02]
 [5.93158936e-02 2.04596596e-03 1.36843134e-03 1.43969077e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.97143285e-02 6.07940025e-05 3.44989345e-04 1.80151224e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.38971942e-05 2.78116848e-04 1.16293475e-03 9.93615767e-02]
 [0.00000000e+00 5.93159628e-01 1.89483641e-03 8.52681546e-04]
 [3.23464396e-01 2.67480494e-04 5.99821041e-04 3.30745283e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.13247624e-03 0.00000000e+00 5.90874070e-01 5.39867832e-03]
 [0.00000000e+00 0.00000000e+00 9.24186898e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.0000

In [8]:
# Evaluate the greedy policy (no exploration noise) over a fixed number of
# episodes and count how many of them end on a step that returned reward 1.0.
total_episodes = 100
successes = 0

for episode in range(total_episodes):
    state, _ = env.reset()
    while True:
        action = np.argmax(Q[state])
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = new_state
        if not done:
            continue
        # The episode has ended: it is a success only if this last step paid 1.0.
        if reward == 1.0:
            successes += 1
        break

In [9]:
# Report the success rate as an actual percentage (successes / total_episodes),
# so the figure stays correct if total_episodes is changed from 100.
print(f"Success rate over {total_episodes} episodes: {successes / total_episodes:.0%}")

Success rate over 100 episodes: 62%
